library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.4
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts ---------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Background and Overview

DataCamp offers interactive courses related to R programming. While some of the material is review, it is helpful to see other perspectives on it. As well, DataCamp has some interesting materials on packages that I want to learn better (ggplot2, dplyr, ggvis, etc.). This document summarizes a few key insights from:

This document is currently split between _v003 and _v003_a and _v003_b and _v003_c and _v003_d due to the need to keep the number of DLLs that it opens below the hard-coded maximum. This introductory section needs to be re-written, and the contents consolidated, at a future date.

The original DataCamp_Insights_v001 and DataCamp_Insights_v002 documents have been split for this document:


Data Manipulation with dplyr in R

Chapter 1 - Transforming Data with dplyr

Counties Dataset:

  • Four main dplyr verbs are select, filter, arrange, mutate
  • The dataset will be US census data at the county level
    • dplyr::glimpse(counties)
    • counties_selected <- counties %>% select(state, county, population, unemployment)

Filter and Arrange Verbs:

  • Can use arrange() to sort the data by the given field(s) - can wrap the variable in desc() to sort descending
  • can use filter() to filter based on a condition that returns a boolean

Mutate:

  • The mutate() verb allows for creating new variables from the existing variables

Example code includes:

# Load the two course datasets from local RDS files, then inspect the
# structure of the counties data (US census data at the county level)
counties <- readRDS("./RInputFiles/counties.rds")
babynames <- readRDS("./RInputFiles/babynames.rds")
str(counties)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 3138 obs. of  40 variables:
##  $ census_id         : chr  "1001" "1003" "1005" "1007" ...
##  $ state             : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ county            : chr  "Autauga" "Baldwin" "Barbour" "Bibb" ...
##  $ region            : chr  "South" "South" "South" "South" ...
##  $ metro             : chr  "Metro" "Metro" "Nonmetro" "Metro" ...
##  $ population        : num  55221 195121 26932 22604 57710 ...
##  $ men               : num  26745 95314 14497 12073 28512 ...
##  $ women             : num  28476 99807 12435 10531 29198 ...
##  $ hispanic          : num  2.6 4.5 4.6 2.2 8.6 4.4 1.2 3.5 0.4 1.5 ...
##  $ white             : num  75.8 83.1 46.2 74.5 87.9 22.2 53.3 73 57.3 91.7 ...
##  $ black             : num  18.5 9.5 46.7 21.4 1.5 70.7 43.8 20.3 40.3 4.8 ...
##  $ native            : num  0.4 0.6 0.2 0.4 0.3 1.2 0.1 0.2 0.2 0.6 ...
##  $ asian             : num  1 0.7 0.4 0.1 0.1 0.2 0.4 0.9 0.8 0.3 ...
##  $ pacific           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ citizens          : num  40725 147695 20714 17495 42345 ...
##  $ income            : num  51281 50254 32964 38678 45813 ...
##  $ income_err        : num  2391 1263 2973 3995 3141 ...
##  $ income_per_cap    : num  24974 27317 16824 18431 20532 ...
##  $ income_per_cap_err: num  1080 711 798 1618 708 ...
##  $ poverty           : num  12.9 13.4 26.7 16.8 16.7 24.6 25.4 20.5 21.6 19.2 ...
##  $ child_poverty     : num  18.6 19.2 45.3 27.9 27.2 38.4 39.2 31.6 37.2 30.1 ...
##  $ professional      : num  33.2 33.1 26.8 21.5 28.5 18.8 27.5 27.3 23.3 29.3 ...
##  $ service           : num  17 17.7 16.1 17.9 14.1 15 16.6 17.7 14.5 16 ...
##  $ office            : num  24.2 27.1 23.1 17.8 23.9 19.7 21.9 24.2 26.3 19.5 ...
##  $ construction      : num  8.6 10.8 10.8 19 13.5 20.1 10.3 10.5 11.5 13.7 ...
##  $ production        : num  17.1 11.2 23.1 23.7 19.9 26.4 23.7 20.4 24.4 21.5 ...
##  $ drive             : num  87.5 84.7 83.8 83.2 84.9 74.9 84.5 85.3 85.1 83.9 ...
##  $ carpool           : num  8.8 8.8 10.9 13.5 11.2 14.9 12.4 9.4 11.9 12.1 ...
##  $ transit           : num  0.1 0.1 0.4 0.5 0.4 0.7 0 0.2 0.2 0.2 ...
##  $ walk              : num  0.5 1 1.8 0.6 0.9 5 0.8 1.2 0.3 0.6 ...
##  $ other_transp      : num  1.3 1.4 1.5 1.5 0.4 1.7 0.6 1.2 0.4 0.7 ...
##  $ work_at_home      : num  1.8 3.9 1.6 0.7 2.3 2.8 1.7 2.7 2.1 2.5 ...
##  $ mean_commute      : num  26.5 26.4 24.1 28.8 34.9 27.5 24.6 24.1 25.1 27.4 ...
##  $ employed          : num  23986 85953 8597 8294 22189 ...
##  $ private_work      : num  73.6 81.5 71.8 76.8 82 79.5 77.4 74.1 85.1 73.1 ...
##  $ public_work       : num  20.9 12.3 20.8 16.1 13.5 15.1 16.2 20.8 12.1 18.5 ...
##  $ self_employed     : num  5.5 5.8 7.3 6.7 4.2 5.4 6.2 5 2.8 7.9 ...
##  $ family_work       : num  0 0.4 0.1 0.4 0.4 0 0.2 0.1 0 0.5 ...
##  $ unemployment      : num  7.6 7.5 17.6 8.3 7.7 18 10.9 12.3 8.9 7.9 ...
##  $ land_area         : num  594 1590 885 623 645 ...
# Inspect the structure of the babynames dataset (columns: year, name, number)
str(babynames)
## Classes 'tbl_df', 'tbl' and 'data.frame':    332595 obs. of  3 variables:
##  $ year  : num  1880 1880 1880 1880 1880 1880 1880 1880 1880 1880 ...
##  $ name  : chr  "Aaron" "Ab" "Abbie" "Abbott" ...
##  $ number: int  102 5 71 5 6 50 9 12 27 81 ...
# Select the columns state, county, population, and poverty
counties %>%
    select(state, county, population, poverty)
## # A tibble: 3,138 x 4
##    state   county   population poverty
##    <chr>   <chr>         <dbl>   <dbl>
##  1 Alabama Autauga       55221    12.9
##  2 Alabama Baldwin      195121    13.4
##  3 Alabama Barbour       26932    26.7
##  4 Alabama Bibb          22604    16.8
##  5 Alabama Blount        57710    16.7
##  6 Alabama Bullock       10678    24.6
##  7 Alabama Butler        20354    25.4
##  8 Alabama Calhoun      116648    20.5
##  9 Alabama Chambers      34079    21.6
## 10 Alabama Cherokee      26008    19.2
## # ... with 3,128 more rows
# Keep only the columns needed for the work-type comparison
counties_selected <- counties %>%
    select(state, county, population, private_work, public_work, self_employed)

# Add a verb to sort in descending order of public_work
counties_selected %>%
    arrange(desc(public_work))
## # A tibble: 3,138 x 6
##    state     county            population private_work public_work self_employed
##    <chr>     <chr>                  <dbl>        <dbl>       <dbl>         <dbl>
##  1 Hawaii    Kalawao                   85         25          64.1          10.9
##  2 Alaska    Yukon-Koyukuk Ce~       5644         33.3        61.7           5.1
##  3 Wisconsin Menominee               4451         36.8        59.1           3.7
##  4 North Da~ Sioux                   4380         32.9        56.8          10.2
##  5 South Da~ Todd                    9942         34.4        55             9.8
##  6 Alaska    Lake and Peninsu~       1474         42.2        51.6           6.1
##  7 Californ~ Lassen                 32645         42.6        50.5           6.8
##  8 South Da~ Buffalo                 2038         48.4        49.5           1.8
##  9 South Da~ Dewey                   5579         34.9        49.2          14.7
## 10 Texas     Kenedy                   565         51.9        48.1           0  
## # ... with 3,128 more rows
# Narrow to the three columns used in the filter example
counties_selected <- counties %>%
    select(state, county, population)

# Filter for counties in the state of California that have a population above 1000000
counties_selected %>%
    filter(state=="California", population > 1000000)
## # A tibble: 9 x 3
##   state      county         population
##   <chr>      <chr>               <dbl>
## 1 California Alameda           1584983
## 2 California Contra Costa      1096068
## 3 California Los Angeles      10038388
## 4 California Orange            3116069
## 5 California Riverside         2298032
## 6 California Sacramento        1465832
## 7 California San Bernardino    2094769
## 8 California San Diego         3223096
## 9 California Santa Clara       1868149
# Re-select the work-type columns for the combined filter + arrange example
counties_selected <- counties %>%
    select(state, county, population, private_work, public_work, self_employed)

# Filter for Texas and more than 10000 people; sort in descending order of private_work
counties_selected %>%
    filter(state=="Texas", population > 10000) %>%
    arrange(desc(private_work))
## # A tibble: 169 x 6
##    state county  population private_work public_work self_employed
##    <chr> <chr>        <dbl>        <dbl>       <dbl>         <dbl>
##  1 Texas Gregg       123178         84.7         9.8           5.4
##  2 Texas Collin      862215         84.1        10             5.8
##  3 Texas Dallas     2485003         83.9         9.5           6.4
##  4 Texas Harris     4356362         83.4        10.1           6.3
##  5 Texas Andrews      16775         83.1         9.6           6.8
##  6 Texas Tarrant    1914526         83.1        11.4           5.4
##  7 Texas Titus        32553         82.5        10             7.4
##  8 Texas Denton      731851         82.2        11.9           5.7
##  9 Texas Ector       149557         82          11.2           6.7
## 10 Texas Moore        22281         82          11.7           5.9
## # ... with 159 more rows
counties_selected <- counties %>%
    select(state, county, population, public_work)

# Sort in descending order of the public_workers column
counties_selected %>%
    # public_work is a percentage, so dividing by 100 gives a head count
    mutate(public_workers = public_work * population / 100) %>%
    arrange(desc(public_workers))
## # A tibble: 3,138 x 5
##    state      county         population public_work public_workers
##    <chr>      <chr>               <dbl>       <dbl>          <dbl>
##  1 California Los Angeles      10038388        11.5       1154415.
##  2 Illinois   Cook              5236393        11.5        602185.
##  3 California San Diego         3223096        14.8        477018.
##  4 Arizona    Maricopa          4018143        11.7        470123.
##  5 Texas      Harris            4356362        10.1        439993.
##  6 New York   Kings             2595259        14.4        373717.
##  7 California San Bernardino    2094769        16.7        349826.
##  8 California Riverside         2298032        14.9        342407.
##  9 California Sacramento        1465832        21.8        319551.
## 10 California Orange            3116069        10.2        317839.
## # ... with 3,128 more rows
# Select the columns state, county, population, men, and women
counties_selected <- counties %>%
    select(state, county, population, men, women)

# Calculate proportion_women as the fraction of the population made up of women
counties_selected %>%
    mutate(proportion_women = women / population)
## # A tibble: 3,138 x 6
##    state   county   population   men women proportion_women
##    <chr>   <chr>         <dbl> <dbl> <dbl>            <dbl>
##  1 Alabama Autauga       55221 26745 28476            0.516
##  2 Alabama Baldwin      195121 95314 99807            0.512
##  3 Alabama Barbour       26932 14497 12435            0.462
##  4 Alabama Bibb          22604 12073 10531            0.466
##  5 Alabama Blount        57710 28512 29198            0.506
##  6 Alabama Bullock       10678  5660  5018            0.470
##  7 Alabama Butler        20354  9502 10852            0.533
##  8 Alabama Calhoun      116648 56274 60374            0.518
##  9 Alabama Chambers      34079 16258 17821            0.523
## 10 Alabama Cherokee      26008 12975 13033            0.501
## # ... with 3,128 more rows
# Full pipeline: select, mutate, filter, and arrange chained with %>%
counties %>%
    # Select the five columns 
    select(state, county, population, men, women) %>%
    # Add the proportion_men variable
    mutate(proportion_men = men/population) %>%
    # Filter for population of at least 10,000
    filter(population >= 10000) %>%
    # Arrange proportion of men in descending order 
    arrange(desc(proportion_men))
## # A tibble: 2,437 x 6
##    state      county         population   men women proportion_men
##    <chr>      <chr>               <dbl> <dbl> <dbl>          <dbl>
##  1 Virginia   Sussex              11864  8130  3734          0.685
##  2 California Lassen              32645 21818 10827          0.668
##  3 Georgia    Chattahoochee       11914  7940  3974          0.666
##  4 Louisiana  West Feliciana      15415 10228  5187          0.664
##  5 Florida    Union               15191  9830  5361          0.647
##  6 Texas      Jones               19978 12652  7326          0.633
##  7 Missouri   DeKalb              12782  8080  4702          0.632
##  8 Texas      Madison             13838  8648  5190          0.625
##  9 Virginia   Greensville         11760  7303  4457          0.621
## 10 Texas      Anderson            57915 35469 22446          0.612
## # ... with 2,427 more rows

Chapter 2 - Aggregating Data

Count Verb:

  • The count() verb will give a count by specific grouping of values
    • count() will give a total count of the rows in the data
    • count(state) will give total count of rows by state
  • Can add a sort capability to get descending (most common first)
    • count(state, sort=TRUE)
  • Can add a weighting capability to the count()
    • count(state, wt=population, sort=TRUE) # wt=population means that population will be summed by state, rather than returning a count of rows

Group By, Summarize, and Ungroup:

  • Can use summarize() to create aggregate statistics such as median(), sum(), max(), n() and the like
  • The group_by(var) command means that any summarize or other action will act on each grouping of var, rather than on the aggregate dataset
  • Can then add an ungroup() to remove the most recent group_by() and revert to regular processing of the data

The top_n verb:

  • The top_n() function is useful for keeping the most extreme observations from each group
    • top_n(1, population) will pull the largest population for each group_by level

Example code includes:

# Use count to find the number of counties in each region
# (sort=TRUE orders the result with the most common region first)
counties %>%
    count(region, sort=TRUE)
## # A tibble: 4 x 2
##   region            n
##   <chr>         <int>
## 1 South          1420
## 2 North Central  1054
## 3 West            447
## 4 Northeast       217
# Find number of counties per state, weighted by citizens
# (wt=citizens sums the citizens column per state instead of counting rows)
counties %>%
    count(state, wt=citizens, sort=TRUE)
## # A tibble: 50 x 2
##    state                 n
##    <chr>             <dbl>
##  1 California     24280349
##  2 Texas          16864864
##  3 Florida        13933052
##  4 New York       13531404
##  5 Pennsylvania    9710416
##  6 Illinois        8979999
##  7 Ohio            8709050
##  8 Michigan        7380136
##  9 North Carolina  7107998
## 10 Georgia         6978660
## # ... with 40 more rows
counties %>%
    # Add population_walk containing the total number of people who walk to work 
    # (walk is a percentage, so divide by 100 to get a head count)
    mutate(population_walk = walk * population / 100) %>%
    # Count weighted by the new column
    count(state, wt=population_walk, sort=TRUE)
## # A tibble: 50 x 2
##    state                n
##    <chr>            <dbl>
##  1 New York      1237938.
##  2 California    1017964.
##  3 Pennsylvania   505397.
##  4 Texas          430783.
##  5 Illinois       400346.
##  6 Massachusetts  316765.
##  7 Florida        284723.
##  8 New Jersey     273047.
##  9 Ohio           266911.
## 10 Washington     239764.
## # ... with 40 more rows
# Summarize to find minimum population, maximum unemployment, and average income
# (no group_by, so the whole dataset collapses to a single row)
counties %>%
    summarize(min_population=min(population), 
              max_unemployment=max(unemployment), 
              average_income=mean(income)
              )
## # A tibble: 1 x 3
##   min_population max_unemployment average_income
##            <dbl>            <dbl>          <dbl>
## 1             85             29.4         46832.
# Add a density column, then sort in descending order
counties %>%
    group_by(state) %>%
    # Aggregate area and population per state before computing density
    summarize(total_area = sum(land_area), total_population = sum(population)) %>%
    mutate(density = total_population / total_area) %>%
    arrange(desc(density))
## # A tibble: 50 x 4
##    state         total_area total_population density
##    <chr>              <dbl>            <dbl>   <dbl>
##  1 New Jersey         7354.          8904413   1211.
##  2 Rhode Island       1034.          1053661   1019.
##  3 Massachusetts      7800.          6705586    860.
##  4 Connecticut        4842.          3593222    742.
##  5 Maryland           9707.          5930538    611.
##  6 Delaware           1949.           926454    475.
##  7 New York          47126.         19673174    417.
##  8 Florida           53625.         19645772    366.
##  9 Pennsylvania      44743.         12779559    286.
## 10 Ohio              40861.         11575977    283.
## # ... with 40 more rows
# Calculate the average_pop and median_pop columns 
counties %>%
    group_by(region, state) %>%
    summarize(total_pop = sum(population)) %>%
    # The second summarize collapses the state level, leaving one row per region
    summarize(average_pop = mean(total_pop), median_pop=median(total_pop))
## # A tibble: 4 x 3
##   region        average_pop median_pop
##   <chr>               <dbl>      <dbl>
## 1 North Central    5627687.    5580644
## 2 Northeast        6221058.    3593222
## 3 South            7370486     4804098
## 4 West             5722755.    2798636
# Group by region and find the greatest number of citizens who walk to work
counties %>%
    group_by(region) %>%
    # top_n(1, walk) keeps the row with the highest walk value in each region
    top_n(1, walk) %>%
    select(state, county, region, metro, population, walk, citizens)
## # A tibble: 4 x 7
## # Groups:   region [4]
##   state       county               region      metro   population  walk citizens
##   <chr>       <chr>                <chr>       <chr>        <dbl> <dbl>    <dbl>
## 1 Alaska      Aleutians East Boro~ West        Nonmet~       3304  71.2     1874
## 2 New York    New York             Northeast   Metro      1629507  20.7  1156936
## 3 North Dako~ McIntosh             North Cent~ Nonmet~       2759  17.5     2239
## 4 Virginia    Lexington city       South       Nonmet~       7071  31.7     6261
counties %>%
    group_by(region, state) %>%
    # Calculate average income
    summarize(average_income=mean(income)) %>%
    # Find the highest income state in each region
    # (after summarize, the data is still grouped by region)
    top_n(1, average_income)
## # A tibble: 4 x 3
## # Groups:   region [4]
##   region        state        average_income
##   <chr>         <chr>                 <dbl>
## 1 North Central North Dakota         55575.
## 2 Northeast     New Jersey           73014.
## 3 South         Maryland             69200.
## 4 West          Alaska               65125.
# Count the states with more people in Metro or Nonmetro areas
counties %>%
    group_by(state, metro) %>%
    summarize(total_pop = sum(population)) %>%
    # Keep the larger of the Metro/Nonmetro rows for each state
    top_n(1, total_pop) %>%
    # Remove the grouping so count() tallies across all states
    ungroup() %>%
    count(metro)
## # A tibble: 2 x 2
##   metro        n
##   <chr>    <int>
## 1 Metro       44
## 2 Nonmetro     6

Chapter 3 - Selecting and Transforming Data

Selecting:

  • Can use the colon operator (:) in select to select everything from column a to column b inclusive
    • select(a, b:d)
  • Can use contains or starts_with to select only columns whose name contains a key phrase
    • select(contains("work"))
    • select(starts_with("income"))
  • Can find the select helpers using ?select_helpers
    • last_col() will grab the last column
  • Can use the minus operator (-) to mean ‘do not select this column’

Renaming:

  • Renaming the columns can be done using rename()
    • rename(newName=oldName)
  • Can also run renaming inside a select statement
    • select(a, b, newName=oldName)

Transmuting:

  • Transmute is a combination of select and mutate
    • transmute(a, b, c = d / e) # final data will have only a, b, c

Example code includes:

# Glimpse the counties table (compact column-by-column preview)
glimpse(counties)
## Observations: 3,138
## Variables: 40
## $ census_id          <chr> "1001", "1003", "1005", "1007", "1009", "1011", ...
## $ state              <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Ala...
## $ county             <chr> "Autauga", "Baldwin", "Barbour", "Bibb", "Blount...
## $ region             <chr> "South", "South", "South", "South", "South", "So...
## $ metro              <chr> "Metro", "Metro", "Nonmetro", "Metro", "Metro", ...
## $ population         <dbl> 55221, 195121, 26932, 22604, 57710, 10678, 20354...
## $ men                <dbl> 26745, 95314, 14497, 12073, 28512, 5660, 9502, 5...
## $ women              <dbl> 28476, 99807, 12435, 10531, 29198, 5018, 10852, ...
## $ hispanic           <dbl> 2.6, 4.5, 4.6, 2.2, 8.6, 4.4, 1.2, 3.5, 0.4, 1.5...
## $ white              <dbl> 75.8, 83.1, 46.2, 74.5, 87.9, 22.2, 53.3, 73.0, ...
## $ black              <dbl> 18.5, 9.5, 46.7, 21.4, 1.5, 70.7, 43.8, 20.3, 40...
## $ native             <dbl> 0.4, 0.6, 0.2, 0.4, 0.3, 1.2, 0.1, 0.2, 0.2, 0.6...
## $ asian              <dbl> 1.0, 0.7, 0.4, 0.1, 0.1, 0.2, 0.4, 0.9, 0.8, 0.3...
## $ pacific            <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...
## $ citizens           <dbl> 40725, 147695, 20714, 17495, 42345, 8057, 15581,...
## $ income             <dbl> 51281, 50254, 32964, 38678, 45813, 31938, 32229,...
## $ income_err         <dbl> 2391, 1263, 2973, 3995, 3141, 5884, 1793, 925, 2...
## $ income_per_cap     <dbl> 24974, 27317, 16824, 18431, 20532, 17580, 18390,...
## $ income_per_cap_err <dbl> 1080, 711, 798, 1618, 708, 2055, 714, 489, 1366,...
## $ poverty            <dbl> 12.9, 13.4, 26.7, 16.8, 16.7, 24.6, 25.4, 20.5, ...
## $ child_poverty      <dbl> 18.6, 19.2, 45.3, 27.9, 27.2, 38.4, 39.2, 31.6, ...
## $ professional       <dbl> 33.2, 33.1, 26.8, 21.5, 28.5, 18.8, 27.5, 27.3, ...
## $ service            <dbl> 17.0, 17.7, 16.1, 17.9, 14.1, 15.0, 16.6, 17.7, ...
## $ office             <dbl> 24.2, 27.1, 23.1, 17.8, 23.9, 19.7, 21.9, 24.2, ...
## $ construction       <dbl> 8.6, 10.8, 10.8, 19.0, 13.5, 20.1, 10.3, 10.5, 1...
## $ production         <dbl> 17.1, 11.2, 23.1, 23.7, 19.9, 26.4, 23.7, 20.4, ...
## $ drive              <dbl> 87.5, 84.7, 83.8, 83.2, 84.9, 74.9, 84.5, 85.3, ...
## $ carpool            <dbl> 8.8, 8.8, 10.9, 13.5, 11.2, 14.9, 12.4, 9.4, 11....
## $ transit            <dbl> 0.1, 0.1, 0.4, 0.5, 0.4, 0.7, 0.0, 0.2, 0.2, 0.2...
## $ walk               <dbl> 0.5, 1.0, 1.8, 0.6, 0.9, 5.0, 0.8, 1.2, 0.3, 0.6...
## $ other_transp       <dbl> 1.3, 1.4, 1.5, 1.5, 0.4, 1.7, 0.6, 1.2, 0.4, 0.7...
## $ work_at_home       <dbl> 1.8, 3.9, 1.6, 0.7, 2.3, 2.8, 1.7, 2.7, 2.1, 2.5...
## $ mean_commute       <dbl> 26.5, 26.4, 24.1, 28.8, 34.9, 27.5, 24.6, 24.1, ...
## $ employed           <dbl> 23986, 85953, 8597, 8294, 22189, 3865, 7813, 474...
## $ private_work       <dbl> 73.6, 81.5, 71.8, 76.8, 82.0, 79.5, 77.4, 74.1, ...
## $ public_work        <dbl> 20.9, 12.3, 20.8, 16.1, 13.5, 15.1, 16.2, 20.8, ...
## $ self_employed      <dbl> 5.5, 5.8, 7.3, 6.7, 4.2, 5.4, 6.2, 5.0, 2.8, 7.9...
## $ family_work        <dbl> 0.0, 0.4, 0.1, 0.4, 0.4, 0.0, 0.2, 0.1, 0.0, 0.5...
## $ unemployment       <dbl> 7.6, 7.5, 17.6, 8.3, 7.7, 18.0, 10.9, 12.3, 8.9,...
## $ land_area          <dbl> 594.44, 1589.78, 884.88, 622.58, 644.78, 622.81,...
counties %>%
    # Select state, county, population, and industry-related columns
    # (the colon selects every column from professional through production)
    select(state, county, population, professional:production) %>%
    # Arrange service in descending order 
    arrange(desc(service))
## # A tibble: 3,138 x 8
##    state   county population professional service office construction production
##    <chr>   <chr>       <dbl>        <dbl>   <dbl>  <dbl>        <dbl>      <dbl>
##  1 Missis~ Tunica      10477         23.9    36.6   21.5          3.5       14.5
##  2 Texas   Kinney       3577         30      36.5   11.6         20.5        1.3
##  3 Texas   Kenedy        565         24.9    34.1   20.5         20.5        0  
##  4 New Yo~ Bronx     1428357         24.3    33.3   24.2          7.1       11  
##  5 Texas   Brooks       7221         19.6    32.4   25.3         11.1       11.5
##  6 Colora~ Fremo~      46809         26.6    32.2   22.8         10.7        7.6
##  7 Texas   Culbe~       2296         20.1    32.2   24.2         15.7        7.8
##  8 Califo~ Del N~      27788         33.9    31.5   18.8          8.9        6.8
##  9 Minnes~ Mahno~       5496         26.8    31.5   18.7         13.1        9.9
## 10 Virgin~ Lanca~      11129         30.3    31.2   22.8          8.1        7.6
## # ... with 3,128 more rows
counties %>%
    # Select the state, county, population, and those ending with "work"
    select(state, county, population, ends_with("work")) %>%
    # Filter for counties that have at least 50% of people engaged in public work
    filter(public_work >= 50)
## # A tibble: 7 x 6
##   state      county              population private_work public_work family_work
##   <chr>      <chr>                    <dbl>        <dbl>       <dbl>       <dbl>
## 1 Alaska     Lake and Peninsula~       1474         42.2        51.6         0.2
## 2 Alaska     Yukon-Koyukuk Cens~       5644         33.3        61.7         0  
## 3 California Lassen                   32645         42.6        50.5         0.1
## 4 Hawaii     Kalawao                     85         25          64.1         0  
## 5 North Dak~ Sioux                     4380         32.9        56.8         0.1
## 6 South Dak~ Todd                      9942         34.4        55           0.8
## 7 Wisconsin  Menominee                 4451         36.8        59.1         0.4
# Rename the n column to num_counties (rename uses newName = oldName)
counties %>%
    count(state) %>%
    rename(num_counties=n)
## # A tibble: 50 x 2
##    state       num_counties
##    <chr>              <int>
##  1 Alabama               67
##  2 Alaska                28
##  3 Arizona               15
##  4 Arkansas              75
##  5 California            58
##  6 Colorado              64
##  7 Connecticut            8
##  8 Delaware               3
##  9 Florida               67
## 10 Georgia              159
## # ... with 40 more rows
# Select state, county, and poverty as poverty_rate
# (renaming can be done directly inside select)
counties %>%
    select(state, county, poverty_rate = poverty)
## # A tibble: 3,138 x 3
##    state   county   poverty_rate
##    <chr>   <chr>           <dbl>
##  1 Alabama Autauga          12.9
##  2 Alabama Baldwin          13.4
##  3 Alabama Barbour          26.7
##  4 Alabama Bibb             16.8
##  5 Alabama Blount           16.7
##  6 Alabama Bullock          24.6
##  7 Alabama Butler           25.4
##  8 Alabama Calhoun          20.5
##  9 Alabama Chambers         21.6
## 10 Alabama Cherokee         19.2
## # ... with 3,128 more rows
counties %>%
    # Keep the state, county, and populations columns, and add a density column
    # (transmute drops every column not named here)
    transmute(state, county, population, density = population / land_area) %>%
    # Filter for counties with a population greater than one million 
    filter(population > 1000000) %>%
    # Sort density in ascending order 
    arrange(density)
## # A tibble: 41 x 4
##    state      county         population density
##    <chr>      <chr>               <dbl>   <dbl>
##  1 California San Bernardino    2094769    104.
##  2 Nevada     Clark             2035572    258.
##  3 California Riverside         2298032    319.
##  4 Arizona    Maricopa          4018143    437.
##  5 Florida    Palm Beach        1378806    700.
##  6 California San Diego         3223096    766.
##  7 Washington King              2045756    967.
##  8 Texas      Travis            1121645   1133.
##  9 Florida    Hillsborough      1302884   1277.
## 10 Florida    Orange            1229039   1360.
## # ... with 31 more rows
# Change the name of the unemployment column; all other columns are kept
counties %>%
    rename(unemployment_rate = unemployment)
## # A tibble: 3,138 x 40
##    census_id state county region metro population   men women hispanic white
##    <chr>     <chr> <chr>  <chr>  <chr>      <dbl> <dbl> <dbl>    <dbl> <dbl>
##  1 1001      Alab~ Autau~ South  Metro      55221 26745 28476      2.6  75.8
##  2 1003      Alab~ Baldw~ South  Metro     195121 95314 99807      4.5  83.1
##  3 1005      Alab~ Barbo~ South  Nonm~      26932 14497 12435      4.6  46.2
##  4 1007      Alab~ Bibb   South  Metro      22604 12073 10531      2.2  74.5
##  5 1009      Alab~ Blount South  Metro      57710 28512 29198      8.6  87.9
##  6 1011      Alab~ Bullo~ South  Nonm~      10678  5660  5018      4.4  22.2
##  7 1013      Alab~ Butler South  Nonm~      20354  9502 10852      1.2  53.3
##  8 1015      Alab~ Calho~ South  Metro     116648 56274 60374      3.5  73  
##  9 1017      Alab~ Chamb~ South  Nonm~      34079 16258 17821      0.4  57.3
## 10 1019      Alab~ Chero~ South  Nonm~      26008 12975 13033      1.5  91.7
## # ... with 3,128 more rows, and 30 more variables: black <dbl>, native <dbl>,
## #   asian <dbl>, pacific <dbl>, citizens <dbl>, income <dbl>, income_err <dbl>,
## #   income_per_cap <dbl>, income_per_cap_err <dbl>, poverty <dbl>,
## #   child_poverty <dbl>, professional <dbl>, service <dbl>, office <dbl>,
## #   construction <dbl>, production <dbl>, drive <dbl>, carpool <dbl>,
## #   transit <dbl>, walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## #   mean_commute <dbl>, employed <dbl>, private_work <dbl>, public_work <dbl>,
## #   self_employed <dbl>, family_work <dbl>, unemployment_rate <dbl>,
## #   land_area <dbl>
# Keep the state and county columns, and the columns containing poverty
# (contains() matches poverty and child_poverty)
counties %>%
    select(state, county, contains("poverty"))
## # A tibble: 3,138 x 4
##    state   county   poverty child_poverty
##    <chr>   <chr>      <dbl>         <dbl>
##  1 Alabama Autauga     12.9          18.6
##  2 Alabama Baldwin     13.4          19.2
##  3 Alabama Barbour     26.7          45.3
##  4 Alabama Bibb        16.8          27.9
##  5 Alabama Blount      16.7          27.2
##  6 Alabama Bullock     24.6          38.4
##  7 Alabama Butler      25.4          39.2
##  8 Alabama Calhoun     20.5          31.6
##  9 Alabama Chambers    21.6          37.2
## 10 Alabama Cherokee    19.2          30.1
## # ... with 3,128 more rows
# Calculate the fraction_women column without dropping the other columns
# (mutate appends the new column; contrast with transmute below)
counties %>%
    mutate(fraction_women = women / population)
## # A tibble: 3,138 x 41
##    census_id state county region metro population   men women hispanic white
##    <chr>     <chr> <chr>  <chr>  <chr>      <dbl> <dbl> <dbl>    <dbl> <dbl>
##  1 1001      Alab~ Autau~ South  Metro      55221 26745 28476      2.6  75.8
##  2 1003      Alab~ Baldw~ South  Metro     195121 95314 99807      4.5  83.1
##  3 1005      Alab~ Barbo~ South  Nonm~      26932 14497 12435      4.6  46.2
##  4 1007      Alab~ Bibb   South  Metro      22604 12073 10531      2.2  74.5
##  5 1009      Alab~ Blount South  Metro      57710 28512 29198      8.6  87.9
##  6 1011      Alab~ Bullo~ South  Nonm~      10678  5660  5018      4.4  22.2
##  7 1013      Alab~ Butler South  Nonm~      20354  9502 10852      1.2  53.3
##  8 1015      Alab~ Calho~ South  Metro     116648 56274 60374      3.5  73  
##  9 1017      Alab~ Chamb~ South  Nonm~      34079 16258 17821      0.4  57.3
## 10 1019      Alab~ Chero~ South  Nonm~      26008 12975 13033      1.5  91.7
## # ... with 3,128 more rows, and 31 more variables: black <dbl>, native <dbl>,
## #   asian <dbl>, pacific <dbl>, citizens <dbl>, income <dbl>, income_err <dbl>,
## #   income_per_cap <dbl>, income_per_cap_err <dbl>, poverty <dbl>,
## #   child_poverty <dbl>, professional <dbl>, service <dbl>, office <dbl>,
## #   construction <dbl>, production <dbl>, drive <dbl>, carpool <dbl>,
## #   transit <dbl>, walk <dbl>, other_transp <dbl>, work_at_home <dbl>,
## #   mean_commute <dbl>, employed <dbl>, private_work <dbl>, public_work <dbl>,
## #   self_employed <dbl>, family_work <dbl>, unemployment <dbl>,
## #   land_area <dbl>, fraction_women <dbl>
# Keep only the state, county, and employment_rate columns
counties %>%
    transmute(state, county, employment_rate = employed / population)
## # A tibble: 3,138 x 3
##    state   county   employment_rate
##    <chr>   <chr>              <dbl>
##  1 Alabama Autauga            0.434
##  2 Alabama Baldwin            0.441
##  3 Alabama Barbour            0.319
##  4 Alabama Bibb               0.367
##  5 Alabama Blount             0.384
##  6 Alabama Bullock            0.362
##  7 Alabama Butler             0.384
##  8 Alabama Calhoun            0.406
##  9 Alabama Chambers           0.402
## 10 Alabama Cherokee           0.390
## # ... with 3,128 more rows

Chapter 4 - Case Study

The babynames dataset:

  • The babynames data includes the names of babies born in the US by year
    • Columns are year-name-number
  • Can use ggplot2 to make line plots of given names
    • babynames %>% filter(name=“myName”) %>% ggplot(aes(x=year, y=number)) + geom_line()

Grouped Mutates:

  • Can run a grouped mutate, which will create the same calculation for every member of the group
    • myDF %>% group_by(a) %>% mutate(d_total=sum(d), d_pct=d/d_total) %>% ungroup()

Window Functions:

  • Window functions include lag() or lead()
    • v <- c(1, 3, 6, 14)
    • lag(v) # c(NA, 1, 3, 6)
    • v - lag(v) # c(NA, 2, 3, 8)

Wrap Up:

  • Transforming data using dplyr - select, filter, mutate, arrange
  • Summarizing with count() and group_by()
  • Renaming with rename() and transmute()
  • Plotting data and running window functions

Example code includes:

# Most popular baby names in 1990, most common first
babynames %>%
    # Filter for the year 1990
    filter(year==1990) %>%
    # Sort the number column in descending order
    arrange(desc(number))
## # A tibble: 21,223 x 3
##     year name        number
##    <dbl> <chr>        <int>
##  1  1990 Michael      65560
##  2  1990 Christopher  52520
##  3  1990 Jessica      46615
##  4  1990 Ashley       45797
##  5  1990 Matthew      44925
##  6  1990 Joshua       43382
##  7  1990 Brittany     36650
##  8  1990 Amanda       34504
##  9  1990 Daniel       33963
## 10  1990 David        33862
## # ... with 21,213 more rows
# Find the most common name in each year
# (top_n() keeps ties; modern dplyr prefers slice_max() for this)
babynames %>%
    group_by(year) %>%
    top_n(1, number)
## # A tibble: 28 x 3
## # Groups:   year [28]
##     year name  number
##    <dbl> <chr>  <int>
##  1  1880 John    9701
##  2  1885 Mary    9166
##  3  1890 Mary   12113
##  4  1895 Mary   13493
##  5  1900 Mary   16781
##  6  1905 Mary   16135
##  7  1910 Mary   22947
##  8  1915 Mary   58346
##  9  1920 Mary   71175
## 10  1925 Mary   70857
## # ... with 18 more rows
# Filter for the names Steven, Thomas, and Matthew
selected_names <- babynames %>%
    filter(name %in% c("Steven", "Thomas", "Matthew"))

# Plot the names over time, using a different color for each name
ggplot(selected_names, aes(x = year, y = number, color = name)) +
    geom_line()

# Find the year each name was most common: a grouped mutate computes each
# year's total births, fraction is then each name's share of that year's
# total, and regrouping by name lets top_n(1) keep each name's peak year
babynames %>%
    group_by(year) %>%
    mutate(year_total=sum(number)) %>%
    ungroup() %>%
    mutate(fraction = number / year_total) %>%
    group_by(name) %>%
    top_n(1, fraction)
## # A tibble: 48,040 x 5
## # Groups:   name [48,040]
##     year name      number year_total  fraction
##    <dbl> <chr>      <int>      <int>     <dbl>
##  1  1880 Abbott         5     201478 0.0000248
##  2  1880 Abe           50     201478 0.000248 
##  3  1880 Abner         27     201478 0.000134 
##  4  1880 Adelbert      28     201478 0.000139 
##  5  1880 Adella        26     201478 0.000129 
##  6  1880 Adolf          6     201478 0.0000298
##  7  1880 Adolph        93     201478 0.000462 
##  8  1880 Agustus        5     201478 0.0000248
##  9  1880 Albert      1493     201478 0.00741  
## 10  1880 Albertina      7     201478 0.0000347
## # ... with 48,030 more rows
names_normalized <- babynames %>%
    group_by(name) %>%
    # Per-name lifetime total and single-year peak count
    mutate(name_total = sum(number), name_max = max(number)) %>%
    # Ungroup the table
    ungroup() %>%
    # Add the fraction_max column: the number divided by the name's maximum
    # (so fraction_max = 1 in the name's peak year)
    mutate(fraction_max = number / name_max)
names_normalized
## # A tibble: 332,595 x 6
##     year name    number name_total name_max fraction_max
##    <dbl> <chr>    <int>      <int>    <int>        <dbl>
##  1  1880 Aaron      102     114739    14635     0.00697 
##  2  1880 Ab           5         77       31     0.161   
##  3  1880 Abbie       71       4330      445     0.160   
##  4  1880 Abbott       5        217       51     0.0980  
##  5  1880 Abby         6      11272     1753     0.00342 
##  6  1880 Abe         50       1832      271     0.185   
##  7  1880 Abel         9      10565     3245     0.00277 
##  8  1880 Abigail     12      72600    15762     0.000761
##  9  1880 Abner       27       1552      199     0.136   
## 10  1880 Abraham     81      17882     2449     0.0331  
## # ... with 332,585 more rows
# Filter for the names Steven, Thomas, and Matthew
names_filtered <- names_normalized %>%
    filter(name %in% c("Steven", "Thomas", "Matthew"))

# Visualize these names over time
# (fraction_max reaches 1 in each name's peak year)
ggplot(names_filtered, aes(x=year, y=fraction_max, color=name)) + 
    geom_line()

# For each name-year, compute the name's share of that year's total births
# (grouped mutate for the per-year total, then an ungrouped fraction)
babynames_fraction <- babynames %>%
    group_by(year) %>%
    mutate(year_total=sum(number)) %>%
    ungroup() %>%
    mutate(fraction = number / year_total)
babynames_fraction
## # A tibble: 332,595 x 5
##     year name    number year_total  fraction
##    <dbl> <chr>    <int>      <int>     <dbl>
##  1  1880 Aaron      102     201478 0.000506 
##  2  1880 Ab           5     201478 0.0000248
##  3  1880 Abbie       71     201478 0.000352 
##  4  1880 Abbott       5     201478 0.0000248
##  5  1880 Abby         6     201478 0.0000298
##  6  1880 Abe         50     201478 0.000248 
##  7  1880 Abel         9     201478 0.0000447
##  8  1880 Abigail     12     201478 0.0000596
##  9  1880 Abner       27     201478 0.000134 
## 10  1880 Abraham     81     201478 0.000402 
## # ... with 332,585 more rows
babynames_fraction %>%
    # Arrange the data in order of name, then year
    arrange(name, year) %>%
    # Group the data by name
    group_by(name) %>%
    # Add a ratio column: each year's fraction relative to the name's
    # previous recorded year (lag() yields NA for a name's first year)
    mutate(ratio = fraction / lag(fraction))
## # A tibble: 332,595 x 6
## # Groups:   name [48,040]
##     year name    number year_total   fraction  ratio
##    <dbl> <chr>    <int>      <int>      <dbl>  <dbl>
##  1  2010 Aaban        9    3672066 0.00000245 NA    
##  2  2015 Aaban       15    3648781 0.00000411  1.68 
##  3  1995 Aadam        6    3652750 0.00000164 NA    
##  4  2000 Aadam        6    3767293 0.00000159  0.970
##  5  2005 Aadam        6    3828460 0.00000157  0.984
##  6  2010 Aadam        7    3672066 0.00000191  1.22 
##  7  2015 Aadam       22    3648781 0.00000603  3.16 
##  8  2010 Aadan       11    3672066 0.00000300 NA    
##  9  2015 Aadan       10    3648781 0.00000274  0.915
## 10  2000 Aadarsh      5    3767293 0.00000133 NA    
## # ... with 332,585 more rows
# As above, but keep only name-years with at least a 1-in-100,000 share,
# so tiny counts do not produce noisy ratios
babynames_ratios_filtered <- babynames_fraction %>%
    arrange(name, year) %>%
    group_by(name) %>%
    mutate(ratio = fraction / lag(fraction)) %>%
    filter(fraction >= 0.00001)
babynames_ratios_filtered
## # A tibble: 104,344 x 6
## # Groups:   name [14,807]
##     year name    number year_total  fraction  ratio
##    <dbl> <chr>    <int>      <int>     <dbl>  <dbl>
##  1  2010 Aaden      450    3672066 0.000123  14.2  
##  2  2015 Aaden      297    3648781 0.0000814  0.664
##  3  2015 Aadhya     265    3648781 0.0000726 14.0  
##  4  2005 Aadi        51    3828460 0.0000133 NA    
##  5  2010 Aadi        54    3672066 0.0000147  1.10 
##  6  2015 Aadi        43    3648781 0.0000118  0.801
##  7  2010 Aaditya     37    3672066 0.0000101  1.48 
##  8  2015 Aadya      159    3648781 0.0000436  4.85 
##  9  2010 Aadyn       38    3672066 0.0000103  3.60 
## 10  2010 Aahana      64    3672066 0.0000174  5.13 
## # ... with 104,334 more rows
babynames_ratios_filtered %>%
    # Extract the largest ratio from each name
    # (the data is still grouped by name from the previous chain)
    top_n(1, ratio) %>%
    # Sort the ratio column in descending order
    arrange(desc(ratio)) %>%
    # Filter for fractions greater than or equal to 0.001
    filter(fraction >= 0.001)
## # A tibble: 291 x 6
## # Groups:   name [291]
##     year name    number year_total fraction ratio
##    <dbl> <chr>    <int>      <int>    <dbl> <dbl>
##  1  1960 Tammy    14365    4152075  0.00346  70.1
##  2  2005 Nevaeh    4610    3828460  0.00120  45.8
##  3  1940 Brenda    5460    2301630  0.00237  37.5
##  4  1885 Grover     774     240822  0.00321  36.0
##  5  1945 Cheryl    8170    2652029  0.00308  24.9
##  6  1955 Lori      4980    4012691  0.00124  23.2
##  7  2010 Khloe     5411    3672066  0.00147  23.2
##  8  1950 Debra     6189    3502592  0.00177  22.6
##  9  2010 Bentley   4001    3672066  0.00109  22.4
## 10  1935 Marlene   4840    2088487  0.00232  16.8
## # ... with 281 more rows

Introduction to Function Writing in R

Chapter 1 - How to Write a Function

Rationale for Using Functions:

  • Arguments can be passed to function by position, by name, or by both
    • mean(numbers, 0.1, TRUE)
    • mean(x=numbers, trim=0.1, na.rm=TRUE)
    • Typical best practice is to name rare arguments and just pass common arguments - so, mean(numbers, trim=0.1, na.rm=TRUE)
  • Functions reduce the amount of code written, which also reduces the risk of errors/typos

Converting Scripts into Functions:

  • A typical workflow is to write a script, then convert repeating portions of the script to a function
    • myFun <- function(arg1, arg2, …) { … }
    • The final result is returned by the function by default

Code Readability:

  • Functions can be thought of as verbs, with dplyr functions being a classic example
    • This is in contrast to many variables, which can be thought of as nouns (representing objects)
  • Functions should be well-named to describe what they do, contain a verb, and be specific
  • There are some trade-offs between readability and typing time, though typically more time is spent reading code than writing code
    • Auto-complete further reduces the amount of typing time, even for long variable names
    • Can also alias any commonly used functions
  • Arguments should be placed in a sensible order, though legacy functions such as lm() do not always follow these practices
    • Data arguments should come first
    • Detail arguments should come second

Example code includes:

# Olympic gold-medal counts, one element per country, named by IOC code
# (presumably the Rio 2016 games — confirm; the IOC entry itself is NA)
gold_medals <- c(46, 27, 26, 19, 17, 12, 10, 9, 8, 8, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 4, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA)
names(gold_medals) <- c('USA', 'GBR', 'CHN', 'RUS', 'GER', 'JPN', 'FRA', 'KOR', 'ITA', 'AUS', 'NED', 'HUN', 'BRA', 'ESP', 'KEN', 'JAM', 'CRO', 'CUB', 'NZL', 'CAN', 'UZB', 'KAZ', 'COL', 'SUI', 'IRI', 'GRE', 'ARG', 'DEN', 'SWE', 'RSA', 'UKR', 'SRB', 'POL', 'PRK', 'BEL', 'THA', 'SVK', 'GEO', 'AZE', 'BLR', 'TUR', 'ARM', 'CZE', 'ETH', 'SLO', 'INA', 'ROU', 'BRN', 'VIE', 'TPE', 'BAH', 'IOA', 'CIV', 'FIJ', 'JOR', 'KOS', 'PUR', 'SIN', 'TJK', 'MAS', 'MEX', 'VEN', 'ALG', 'IRL', 'LTU', 'BUL', 'IND', 'MGL', 'BDI', 'GRN', 'NIG', 'PHI', 'QAT', 'NOR', 'EGY', 'TUN', 'ISR', 'AUT', 'DOM', 'EST', 'FIN', 'MAR', 'NGR', 'POR', 'TTO', 'UAE', 'IOC')


# Look at the gold medals data
gold_medals
## USA GBR CHN RUS GER JPN FRA KOR ITA AUS NED HUN BRA ESP KEN JAM CRO CUB NZL CAN 
##  46  27  26  19  17  12  10   9   8   8   8   8   7   7   6   6   5   5   4   4 
## UZB KAZ COL SUI IRI GRE ARG DEN SWE RSA UKR SRB POL PRK BEL THA SVK GEO AZE BLR 
##   4   3   3   3   3   3   3   2   2   2   2   2   2   2   2   2   2   2   1   1 
## TUR ARM CZE ETH SLO INA ROU BRN VIE TPE BAH IOA CIV FIJ JOR KOS PUR SIN TJK MAS 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   0 
## MEX VEN ALG IRL LTU BUL IND MGL BDI GRN NIG PHI QAT NOR EGY TUN ISR AUT DOM EST 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
## FIN MAR NGR POR TTO UAE IOC 
##   0   0   0   0   0   0  NA
# Note the arguments to median()
args(median)
## function (x, na.rm = FALSE, ...) 
## NULL
# Rewrite this function call, following best practices:
# data argument passed by position, rare na.rm argument passed by name
median(gold_medals, na.rm=TRUE)
## [1] 1
# Note the arguments to rank()
args(rank)
## function (x, na.last = TRUE, ties.method = c("average", "first", 
##     "last", "random", "max", "min")) 
## NULL
# Rewrite this function call, following best practices:
# negating ranks descending; na.last="keep" preserves NA in the result;
# ties.method="min" gives tied countries the smallest (best) shared rank
rank(-gold_medals, na.last="keep", ties.method = "min")
## USA GBR CHN RUS GER JPN FRA KOR ITA AUS NED HUN BRA ESP KEN JAM CRO CUB NZL CAN 
##   1   2   3   4   5   6   7   8   9   9   9   9  13  13  15  15  17  17  19  19 
## UZB KAZ COL SUI IRI GRE ARG DEN SWE RSA UKR SRB POL PRK BEL THA SVK GEO AZE BLR 
##  19  22  22  22  22  22  22  28  28  28  28  28  28  28  28  28  28  28  39  39 
## TUR ARM CZE ETH SLO INA ROU BRN VIE TPE BAH IOA CIV FIJ JOR KOS PUR SIN TJK MAS 
##  39  39  39  39  39  39  39  39  39  39  39  39  39  39  39  39  39  39  39  60 
## MEX VEN ALG IRL LTU BUL IND MGL BDI GRN NIG PHI QAT NOR EGY TUN ISR AUT DOM EST 
##  60  60  60  60  60  60  60  60  60  60  60  60  60  60  60  60  60  60  60  60 
## FIN MAR NGR POR TTO UAE IOC 
##  60  60  60  60  60  60  NA
# The two possible outcomes of a coin flip
coin_sides <- c("head", "tail")

# Sample from coin_sides once (equal probability for each side)
sample(coin_sides, 1)
## [1] "head"
# Your functions, from previous steps
# Simulate one fair coin flip, returning "head" or "tail"
toss_coin <- function() {
    sample(c("head", "tail"), 1)
}

# Call your function
toss_coin()
## [1] "tail"
# Simulate n_flips fair coin flips, returning a character vector of
# "head"/"tail" outcomes
toss_coin <- function(n_flips) {
    sample(c("head", "tail"), size = n_flips, replace = TRUE)
}

# Generate 10 coin tosses
toss_coin(10)
##  [1] "head" "tail" "head" "head" "head" "tail" "head" "tail" "tail" "head"
# Simulate n_flips coin flips where heads come up with probability p_head
toss_coin <- function(n_flips, p_head) {
    # Weight the two sides: p_head for "head", the remainder for "tail"
    side_weights <- c(p_head, 1 - p_head)
    sample(c("head", "tail"), size = n_flips, replace = TRUE, prob = side_weights)
}

# Generate 10 coin tosses
toss_coin(10, p_head=0.8)
##  [1] "head" "head" "head" "head" "head" "head" "head" "head" "head" "head"
# Load the Snake River visitor survey (410 rows: n_visits plus three factors)
snake_river_visits <- readRDS("./RInputFiles/snake_river_visits.rds")
str(snake_river_visits)
## 'data.frame':    410 obs. of  4 variables:
##  $ n_visits: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ gender  : Factor w/ 2 levels "male","female": 1 1 1 2 1 2 2 2 1 1 ...
##  $ income  : Factor w/ 4 levels "[$0,$25k]","($25k,$55k]",..: 4 2 4 2 4 2 4 4 4 4 ...
##  $ travel  : Factor w/ 3 levels "[0h,0.25h]","(0.25h,4h]",..: NA NA NA NA NA NA NA NA NA NA ...
# Run a generalized linear regression
# (rows with NA in any predictor are dropped automatically by glm())
glm(
    # Model no. of visits vs. gender, income, travel
    n_visits ~ gender + income + travel, 
    # Use the snake_river_visits dataset
    data = snake_river_visits, 
    # Make it a Poisson regression
    family = poisson
)
## 
## Call:  glm(formula = n_visits ~ gender + income + travel, family = poisson, 
##     data = snake_river_visits)
## 
## Coefficients:
##       (Intercept)       genderfemale  income($25k,$55k]  income($55k,$95k]  
##            4.0864             0.3740            -0.0199            -0.5807  
## income($95k,$Inf)   travel(0.25h,4h]    travel(4h,Infh)  
##           -0.5782            -0.6271            -2.4230  
## 
## Degrees of Freedom: 345 Total (i.e. Null);  339 Residual
##   (64 observations deleted due to missingness)
## Null Deviance:       18850 
## Residual Deviance: 11530     AIC: 12860
# From previous step
# Fit a Poisson-family GLM of `formula` on `data`; the data argument comes
# first so the function composes cleanly with the pipe
run_poisson_regression <- function(data, formula) {
    glm(formula = formula, data = data, family = poisson)
}

# Re-run the Poisson regression, using your function
# (the pipe supplies snake_river_visits as the first, data, argument)
model <- snake_river_visits %>%
    run_poisson_regression(n_visits ~ gender + income + travel)


# Factor levels for the income and travel-time categories
icLevels <- c("[$0,$25k]", "($25k,$55k]", "($55k,$95k]", "($95k,$Inf)")
trLevels <- c("[0h,0.25h]", "(0.25h,4h]", "(4h,Infh)")
# Build every gender x income x travel combination (2 x 4 x 3 = 24 rows),
# with gender varying fastest and travel slowest
srGender <- rep(c("male", "female"), times = 12)
srIncome <- rep(rep(icLevels, each = 2), times = 3)
srTravel <- rep(trLevels, each = 8)
snake_river_explanatory <- data.frame(
    gender = factor(srGender, levels = c("male", "female")),
    income = factor(srIncome, levels = icLevels),
    travel = factor(srTravel, levels = trLevels)
)
str(snake_river_explanatory)
## 'data.frame':    24 obs. of  3 variables:
##  $ gender: Factor w/ 2 levels "male","female": 1 2 1 2 1 2 1 2 1 2 ...
##  $ income: Factor w/ 4 levels "[$0,$25k]","($25k,$55k]",..: 1 1 2 2 3 3 4 4 1 1 ...
##  $ travel: Factor w/ 3 levels "[0h,0.25h]","(0.25h,4h]",..: 1 1 1 1 1 1 1 1 2 2 ...
# Run this to see the predictions
# predict(type = "response") returns expected visit counts on the original
# count scale; "." passes the piped data frame as the newdata argument
snake_river_explanatory %>%
    mutate(predicted_n_visits = predict(model, ., type = "response"))%>%
    arrange(desc(predicted_n_visits))
##    gender      income     travel predicted_n_visits
## 1  female   [$0,$25k] [0h,0.25h]          86.518598
## 2  female ($25k,$55k] [0h,0.25h]          84.813684
## 3    male   [$0,$25k] [0h,0.25h]          59.524843
## 4    male ($25k,$55k] [0h,0.25h]          58.351861
## 5  female ($95k,$Inf) [0h,0.25h]          48.526883
## 6  female ($55k,$95k] [0h,0.25h]          48.408009
## 7  female   [$0,$25k] (0.25h,4h]          46.212343
## 8  female ($25k,$55k] (0.25h,4h]          45.301694
## 9    male ($95k,$Inf) [0h,0.25h]          33.386522
## 10   male ($55k,$95k] [0h,0.25h]          33.304737
## 11   male   [$0,$25k] (0.25h,4h]          31.794117
## 12   male ($25k,$55k] (0.25h,4h]          31.167590
## 13 female ($95k,$Inf) (0.25h,4h]          25.919756
## 14 female ($55k,$95k] (0.25h,4h]          25.856261
## 15   male ($95k,$Inf) (0.25h,4h]          17.832806
## 16   male ($55k,$95k] (0.25h,4h]          17.789122
## 17 female   [$0,$25k]  (4h,Infh)           7.670599
## 18 female ($25k,$55k]  (4h,Infh)           7.519444
## 19   male   [$0,$25k]  (4h,Infh)           5.277376
## 20   male ($25k,$55k]  (4h,Infh)           5.173382
## 21 female ($95k,$Inf)  (4h,Infh)           4.302315
## 22 female ($55k,$95k]  (4h,Infh)           4.291776
## 23   male ($95k,$Inf)  (4h,Infh)           2.959995
## 24   male ($55k,$95k]  (4h,Infh)           2.952744

Chapter 2 - Arguments

Default Arguments:

  • Often, it is helpful to have a default argument when users will rarely want to deviate from that
    • toss_coin <- function(n_flips, p_head=0.5) { … }
  • Can also set defaults to other arguments
    • myFun <- function(a, b=TRUE, c=b, d=c) { … }
  • There are two special types of defaults
    • NULL typically drives special handling (see the documentation)
    • Categorical defaults can be set by passing a character vector in the function arguments and then calling match.arg() in the function body
    • myFun <- function(a, b=NULL, c=c(“d”, “e”)) # note that c=“d” will be the default if not passed, while an error is generated if something other than “d” or “e” is passed
  • Cutting is the process of converting a numerical variable to a categorical variable (e.g., “0-6”, “7-9”, etc.)

Passing Arguments Between Functions:

  • Can pass arguments from one function to another by placing them in the called function
  • The ellipsis … argument allows for simplifying code, meaning ‘accept any other arguments and pass them’
    • In addition to allowing for less typing, it avoids the need to re-write the main function any time the sub-functions update
    • This is a double-edged sword, as the above behavior depends on both trust and the need for stability

Checking Arguments:

  • Mistakes by the function writer are referred to as “bugs”
  • Can modify functions to throw their own error messages when inputs are of the wrong type, format, length, etc.
    • if (!is.numeric(x)) { stop(“Need to provide a numeric for x”) }
  • Can instead use the assertive package which provides clear messages when an assertion fails
    • assert_is_numeric(x)
    • assert_all_are_positive(x)
    • The assert_* functions are built on predicate counterparts such as is_numeric() and is_positive()
  • Can also fix inputs if they are otherwise non-meaningful
    • use_first() will take only the first argument from a longer element
    • coerce_to(x, “character”) will coerce to character

Example code includes:

# Pull out the raw visit counts; the summary shows they are heavily
# right-skewed (median 4 vs. mean ~25, max 350)
n_visits <- snake_river_visits$n_visits
summary(n_visits)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    1.00    4.00   24.99   30.00  350.00
# Bin a numeric vector into n quantile-based groups (default: quintiles).
# na.rm, labels and interval_type must still be supplied by the caller.
cut_by_quantile <- function(x, n=5, na.rm, labels, interval_type) {
    # Break points sit at the 0, 1/n, 2/n, ..., 1 quantiles of x
    quantile_probs <- seq(0, 1, length.out = n + 1)
    break_points <- quantile(x, quantile_probs, na.rm = na.rm, names = FALSE)
    # interval_type selects which end of each bin is closed
    closed_right <- switch(interval_type, "(lo, hi]" = TRUE, "[lo, hi)" = FALSE)
    cut(x, break_points, labels = labels, right = closed_right, include.lowest = TRUE)
}

# Remove the n argument from the call
# (n defaults to 5, giving quintile bins)
cut_by_quantile(n_visits, na.rm = FALSE, 
                labels = c("very low", "low", "medium", "high", "very high"), interval_type = "(lo, hi]"
                )
##   [1] very low  very low  very low  very low  very low  very low  very low 
##   [8] very low  very low  very low  very low  very low  very low  very low 
##  [15] very low  very low  very low  very low  very low  high      very high
##  [22] high      very low  medium    low       very low  very low  very low 
##  [29] very low  very low  very low  very high very high very high very high
##  [36] very high high      very high very high very high very high very high
##  [43] medium    very high very high very high medium    medium    low      
##  [50] high      high      high      very high very high high      high     
##  [57] very high medium    very high high      medium    high      very high
##  [64] very high very high very high high      high      very high high     
##  [71] very low  very high high      high      medium    high      high     
##  [78] high      medium    very high very high very high high      high     
##  [85] high      very low  very high medium    high      very high high     
##  [92] high      very high high      very low  very low  medium    very low 
##  [99] medium    medium    very high medium    medium    medium    high     
## [106] low       high      very high medium    very high medium    very high
## [113] low       very high low       very high high      very low  very low 
## [120] very low  very low  low       very low  very low  very low  very low 
## [127] very low  very low  medium    very low  very low  low       low      
## [134] very low  very low  low       very low  very low  very low  low      
## [141] low       medium    medium    medium    medium    medium    very low 
## [148] very low  low       very low  low       medium    very low  very low 
## [155] very low  very low  very high high      very high high      medium   
## [162] very high medium    very low  high      medium    high      high     
## [169] very high high      high      very high very high high      very high
## [176] high      high      medium    very high high      high      high     
## [183] very high very high very low  high      very high high      high     
## [190] medium    very high high      very high high      very high high     
## [197] very high high      very high very low  high      very high very high
## [204] very low  very low  medium    very high medium    low       medium   
## [211] high      medium    very low  medium    very high high      very high
## [218] high      very high high      low       high      medium    very high
## [225] medium    high      high      high      very low  high      high     
## [232] high      very high high      medium    medium    very low  very low 
## [239] very low  very low  medium    low       very low  very low  very low 
## [246] medium    high      very low  very low  medium    very low  very low 
## [253] very low  very low  very low  very low  very low  very low  very low 
## [260] very low  very high medium    very low  very high medium    very high
## [267] medium    low       very high medium    medium    medium    low      
## [274] high      medium    high      very high medium    very high very high
## [281] medium    medium    very high high      medium    very high high     
## [288] medium    low       very low  medium    very low  very low  very low 
## [295] very low  very low  low       very low  very low  very low  very low 
## [302] very low  very low  very low  very low  low       very low  very low 
## [309] very low  very low  low       very low  very low  low       very low 
## [316] very low  very low  very low  low       very low  very low  very low 
## [323] very low  very low  low       very low  very low  very low  very low 
## [330] very low  very low  very low  very low  very low  very low  very low 
## [337] very low  very low  very low  very low  very low  very low  very low 
## [344] very low  very low  medium    very low  very low  very low  very low 
## [351] very low  very low  very low  very low  very low  very low  very low 
## [358] very low  low       very low  very low  very low  very low  very low 
## [365] very low  very low  very low  very low  very low  very low  low      
## [372] very low  very low  very low  very high high      very high very high
## [379] very high high      very high very high very high very high medium   
## [386] medium    medium    high      very high high      high      high     
## [393] high      high      high      high      very high high      very high
## [400] medium    high      low       high      very high low       very low 
## [407] medium    very low  medium    low      
## Levels: very low low medium high very high
# Bin a numeric vector into n quantile-based groups; NAs are now kept by
# default (na.rm = FALSE). labels and interval_type still have no default.
cut_by_quantile <- function(x, n = 5, na.rm=FALSE, labels, interval_type) {
    # Quantile probabilities 0, 1/n, ..., 1 define the bin edges
    edge_probs <- seq(0, 1, length.out = n + 1)
    edges <- quantile(x, edge_probs, na.rm = na.rm, names = FALSE)
    # Choose which side of each interval is closed
    right_closed <- switch(interval_type, "(lo, hi]" = TRUE, "[lo, hi)" = FALSE)
    cut(x, edges, labels = labels, right = right_closed, include.lowest = TRUE)
}

# Remove the na.rm argument from the call
# (na.rm now defaults to FALSE)
cut_by_quantile(n_visits, labels = c("very low", "low", "medium", "high", "very high"), 
                interval_type = "(lo, hi]"
                )
##   [1] very low  very low  very low  very low  very low  very low  very low 
##   [8] very low  very low  very low  very low  very low  very low  very low 
##  [15] very low  very low  very low  very low  very low  high      very high
##  [22] high      very low  medium    low       very low  very low  very low 
##  [29] very low  very low  very low  very high very high very high very high
##  [36] very high high      very high very high very high very high very high
##  [43] medium    very high very high very high medium    medium    low      
##  [50] high      high      high      very high very high high      high     
##  [57] very high medium    very high high      medium    high      very high
##  [64] very high very high very high high      high      very high high     
##  [71] very low  very high high      high      medium    high      high     
##  [78] high      medium    very high very high very high high      high     
##  [85] high      very low  very high medium    high      very high high     
##  [92] high      very high high      very low  very low  medium    very low 
##  [99] medium    medium    very high medium    medium    medium    high     
## [106] low       high      very high medium    very high medium    very high
## [113] low       very high low       very high high      very low  very low 
## [120] very low  very low  low       very low  very low  very low  very low 
## [127] very low  very low  medium    very low  very low  low       low      
## [134] very low  very low  low       very low  very low  very low  low      
## [141] low       medium    medium    medium    medium    medium    very low 
## [148] very low  low       very low  low       medium    very low  very low 
## [155] very low  very low  very high high      very high high      medium   
## [162] very high medium    very low  high      medium    high      high     
## [169] very high high      high      very high very high high      very high
## [176] high      high      medium    very high high      high      high     
## [183] very high very high very low  high      very high high      high     
## [190] medium    very high high      very high high      very high high     
## [197] very high high      very high very low  high      very high very high
## [204] very low  very low  medium    very high medium    low       medium   
## [211] high      medium    very low  medium    very high high      very high
## [218] high      very high high      low       high      medium    very high
## [225] medium    high      high      high      very low  high      high     
## [232] high      very high high      medium    medium    very low  very low 
## [239] very low  very low  medium    low       very low  very low  very low 
## [246] medium    high      very low  very low  medium    very low  very low 
## [253] very low  very low  very low  very low  very low  very low  very low 
## [260] very low  very high medium    very low  very high medium    very high
## [267] medium    low       very high medium    medium    medium    low      
## [274] high      medium    high      very high medium    very high very high
## [281] medium    medium    very high high      medium    very high high     
## [288] medium    low       very low  medium    very low  very low  very low 
## [295] very low  very low  low       very low  very low  very low  very low 
## [302] very low  very low  very low  very low  low       very low  very low 
## [309] very low  very low  low       very low  very low  low       very low 
## [316] very low  very low  very low  low       very low  very low  very low 
## [323] very low  very low  low       very low  very low  very low  very low 
## [330] very low  very low  very low  very low  very low  very low  very low 
## [337] very low  very low  very low  very low  very low  very low  very low 
## [344] very low  very low  medium    very low  very low  very low  very low 
## [351] very low  very low  very low  very low  very low  very low  very low 
## [358] very low  low       very low  very low  very low  very low  very low 
## [365] very low  very low  very low  very low  very low  very low  low      
## [372] very low  very low  very low  very high high      very high very high
## [379] very high high      very high very high very high very high medium   
## [386] medium    medium    high      very high high      high      high     
## [393] high      high      high      high      very high high      very high
## [400] medium    high      low       high      very high low       very low 
## [407] medium    very low  medium    low      
## Levels: very low low medium high very high
# Bin a numeric vector into n quantile-based groups. With labels = NULL
# (the new default), cut() builds interval labels from the break points.
cut_by_quantile <- function(x, n = 5, na.rm = FALSE, labels=NULL, interval_type) {
    # Bin edges at the 0, 1/n, ..., 1 quantiles of x
    probabilities <- seq(0, 1, length.out = n + 1)
    bin_edges <- quantile(x, probabilities, na.rm = na.rm, names = FALSE)
    # interval_type determines whether bins are right-closed or left-closed
    is_right_closed <- switch(interval_type, "(lo, hi]" = TRUE, "[lo, hi)" = FALSE)
    cut(x, bin_edges, labels = labels, right = is_right_closed, include.lowest = TRUE)
}

# Remove the labels argument from the call
# (labels = NULL lets cut() construct interval labels from the breaks)
cut_by_quantile(n_visits, interval_type = "(lo, hi]")
##   [1] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
##   [9] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
##  [17] [0,1]    [0,1]    [0,1]    (10,35]  (35,350] (10,35]  [0,1]    (2,10]  
##  [25] (1,2]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    (35,350]
##  [33] (35,350] (35,350] (35,350] (35,350] (10,35]  (35,350] (35,350] (35,350]
##  [41] (35,350] (35,350] (2,10]   (35,350] (35,350] (35,350] (2,10]   (2,10]  
##  [49] (1,2]    (10,35]  (10,35]  (10,35]  (35,350] (35,350] (10,35]  (10,35] 
##  [57] (35,350] (2,10]   (35,350] (10,35]  (2,10]   (10,35]  (35,350] (35,350]
##  [65] (35,350] (35,350] (10,35]  (10,35]  (35,350] (10,35]  [0,1]    (35,350]
##  [73] (10,35]  (10,35]  (2,10]   (10,35]  (10,35]  (10,35]  (2,10]   (35,350]
##  [81] (35,350] (35,350] (10,35]  (10,35]  (10,35]  [0,1]    (35,350] (2,10]  
##  [89] (10,35]  (35,350] (10,35]  (10,35]  (35,350] (10,35]  [0,1]    [0,1]   
##  [97] (2,10]   [0,1]    (2,10]   (2,10]   (35,350] (2,10]   (2,10]   (2,10]  
## [105] (10,35]  (1,2]    (10,35]  (35,350] (2,10]   (35,350] (2,10]   (35,350]
## [113] (1,2]    (35,350] (1,2]    (35,350] (10,35]  [0,1]    [0,1]    [0,1]   
## [121] [0,1]    (1,2]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [129] (2,10]   [0,1]    [0,1]    (1,2]    (1,2]    [0,1]    [0,1]    (1,2]   
## [137] [0,1]    [0,1]    [0,1]    (1,2]    (1,2]    (2,10]   (2,10]   (2,10]  
## [145] (2,10]   (2,10]   [0,1]    [0,1]    (1,2]    [0,1]    (1,2]    (2,10]  
## [153] [0,1]    [0,1]    [0,1]    [0,1]    (35,350] (10,35]  (35,350] (10,35] 
## [161] (2,10]   (35,350] (2,10]   [0,1]    (10,35]  (2,10]   (10,35]  (10,35] 
## [169] (35,350] (10,35]  (10,35]  (35,350] (35,350] (10,35]  (35,350] (10,35] 
## [177] (10,35]  (2,10]   (35,350] (10,35]  (10,35]  (10,35]  (35,350] (35,350]
## [185] [0,1]    (10,35]  (35,350] (10,35]  (10,35]  (2,10]   (35,350] (10,35] 
## [193] (35,350] (10,35]  (35,350] (10,35]  (35,350] (10,35]  (35,350] [0,1]   
## [201] (10,35]  (35,350] (35,350] [0,1]    [0,1]    (2,10]   (35,350] (2,10]  
## [209] (1,2]    (2,10]   (10,35]  (2,10]   [0,1]    (2,10]   (35,350] (10,35] 
## [217] (35,350] (10,35]  (35,350] (10,35]  (1,2]    (10,35]  (2,10]   (35,350]
## [225] (2,10]   (10,35]  (10,35]  (10,35]  [0,1]    (10,35]  (10,35]  (10,35] 
## [233] (35,350] (10,35]  (2,10]   (2,10]   [0,1]    [0,1]    [0,1]    [0,1]   
## [241] (2,10]   (1,2]    [0,1]    [0,1]    [0,1]    (2,10]   (10,35]  [0,1]   
## [249] [0,1]    (2,10]   [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [257] [0,1]    [0,1]    [0,1]    [0,1]    (35,350] (2,10]   [0,1]    (35,350]
## [265] (2,10]   (35,350] (2,10]   (1,2]    (35,350] (2,10]   (2,10]   (2,10]  
## [273] (1,2]    (10,35]  (2,10]   (10,35]  (35,350] (2,10]   (35,350] (35,350]
## [281] (2,10]   (2,10]   (35,350] (10,35]  (2,10]   (35,350] (10,35]  (2,10]  
## [289] (1,2]    [0,1]    (2,10]   [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [297] (1,2]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [305] [0,1]    (1,2]    [0,1]    [0,1]    [0,1]    [0,1]    (1,2]    [0,1]   
## [313] [0,1]    (1,2]    [0,1]    [0,1]    [0,1]    [0,1]    (1,2]    [0,1]   
## [321] [0,1]    [0,1]    [0,1]    [0,1]    (1,2]    [0,1]    [0,1]    [0,1]   
## [329] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [337] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [345] [0,1]    (2,10]   [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [353] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    (1,2]    [0,1]   
## [361] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [369] [0,1]    [0,1]    (1,2]    [0,1]    [0,1]    [0,1]    (35,350] (10,35] 
## [377] (35,350] (35,350] (35,350] (10,35]  (35,350] (35,350] (35,350] (35,350]
## [385] (2,10]   (2,10]   (2,10]   (10,35]  (35,350] (10,35]  (10,35]  (10,35] 
## [393] (10,35]  (10,35]  (10,35]  (10,35]  (35,350] (10,35]  (35,350] (2,10]  
## [401] (10,35]  (1,2]    (10,35]  (35,350] (1,2]    [0,1]    (2,10]   [0,1]   
## [409] (2,10]   (1,2]   
## Levels: [0,1] (1,2] (2,10] (10,35] (35,350]
# Set the categories for interval_type to "(lo, hi]" and "[lo, hi)"
# Final version: interval_type is restricted to two choices via
# match.arg(), and the first choice is used when the caller omits it.
cut_by_quantile <- function(x, n = 5, na.rm = FALSE, labels = NULL, 
                            interval_type=c("(lo, hi]", "[lo, hi)")
                            ) {
    # With no explicit choices, match.arg() validates against (and
    # partially matches on) the formal's default vector above
    interval_type <- match.arg(interval_type)
    cut_points <- quantile(x, seq(0, 1, length.out = n + 1),
                           na.rm = na.rm, names = FALSE)
    # After match.arg(), a simple comparison decides right-closure
    closed_on_right <- interval_type == "(lo, hi]"
    cut(x, cut_points, labels = labels, right = closed_on_right,
        include.lowest = TRUE)
}

# Remove the interval_type argument from the call; match.arg() then
# falls back to the first choice, "(lo, hi]"
cut_by_quantile(n_visits)
##   [1] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
##   [9] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
##  [17] [0,1]    [0,1]    [0,1]    (10,35]  (35,350] (10,35]  [0,1]    (2,10]  
##  [25] (1,2]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    (35,350]
##  [33] (35,350] (35,350] (35,350] (35,350] (10,35]  (35,350] (35,350] (35,350]
##  [41] (35,350] (35,350] (2,10]   (35,350] (35,350] (35,350] (2,10]   (2,10]  
##  [49] (1,2]    (10,35]  (10,35]  (10,35]  (35,350] (35,350] (10,35]  (10,35] 
##  [57] (35,350] (2,10]   (35,350] (10,35]  (2,10]   (10,35]  (35,350] (35,350]
##  [65] (35,350] (35,350] (10,35]  (10,35]  (35,350] (10,35]  [0,1]    (35,350]
##  [73] (10,35]  (10,35]  (2,10]   (10,35]  (10,35]  (10,35]  (2,10]   (35,350]
##  [81] (35,350] (35,350] (10,35]  (10,35]  (10,35]  [0,1]    (35,350] (2,10]  
##  [89] (10,35]  (35,350] (10,35]  (10,35]  (35,350] (10,35]  [0,1]    [0,1]   
##  [97] (2,10]   [0,1]    (2,10]   (2,10]   (35,350] (2,10]   (2,10]   (2,10]  
## [105] (10,35]  (1,2]    (10,35]  (35,350] (2,10]   (35,350] (2,10]   (35,350]
## [113] (1,2]    (35,350] (1,2]    (35,350] (10,35]  [0,1]    [0,1]    [0,1]   
## [121] [0,1]    (1,2]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [129] (2,10]   [0,1]    [0,1]    (1,2]    (1,2]    [0,1]    [0,1]    (1,2]   
## [137] [0,1]    [0,1]    [0,1]    (1,2]    (1,2]    (2,10]   (2,10]   (2,10]  
## [145] (2,10]   (2,10]   [0,1]    [0,1]    (1,2]    [0,1]    (1,2]    (2,10]  
## [153] [0,1]    [0,1]    [0,1]    [0,1]    (35,350] (10,35]  (35,350] (10,35] 
## [161] (2,10]   (35,350] (2,10]   [0,1]    (10,35]  (2,10]   (10,35]  (10,35] 
## [169] (35,350] (10,35]  (10,35]  (35,350] (35,350] (10,35]  (35,350] (10,35] 
## [177] (10,35]  (2,10]   (35,350] (10,35]  (10,35]  (10,35]  (35,350] (35,350]
## [185] [0,1]    (10,35]  (35,350] (10,35]  (10,35]  (2,10]   (35,350] (10,35] 
## [193] (35,350] (10,35]  (35,350] (10,35]  (35,350] (10,35]  (35,350] [0,1]   
## [201] (10,35]  (35,350] (35,350] [0,1]    [0,1]    (2,10]   (35,350] (2,10]  
## [209] (1,2]    (2,10]   (10,35]  (2,10]   [0,1]    (2,10]   (35,350] (10,35] 
## [217] (35,350] (10,35]  (35,350] (10,35]  (1,2]    (10,35]  (2,10]   (35,350]
## [225] (2,10]   (10,35]  (10,35]  (10,35]  [0,1]    (10,35]  (10,35]  (10,35] 
## [233] (35,350] (10,35]  (2,10]   (2,10]   [0,1]    [0,1]    [0,1]    [0,1]   
## [241] (2,10]   (1,2]    [0,1]    [0,1]    [0,1]    (2,10]   (10,35]  [0,1]   
## [249] [0,1]    (2,10]   [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [257] [0,1]    [0,1]    [0,1]    [0,1]    (35,350] (2,10]   [0,1]    (35,350]
## [265] (2,10]   (35,350] (2,10]   (1,2]    (35,350] (2,10]   (2,10]   (2,10]  
## [273] (1,2]    (10,35]  (2,10]   (10,35]  (35,350] (2,10]   (35,350] (35,350]
## [281] (2,10]   (2,10]   (35,350] (10,35]  (2,10]   (35,350] (10,35]  (2,10]  
## [289] (1,2]    [0,1]    (2,10]   [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [297] (1,2]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [305] [0,1]    (1,2]    [0,1]    [0,1]    [0,1]    [0,1]    (1,2]    [0,1]   
## [313] [0,1]    (1,2]    [0,1]    [0,1]    [0,1]    [0,1]    (1,2]    [0,1]   
## [321] [0,1]    [0,1]    [0,1]    [0,1]    (1,2]    [0,1]    [0,1]    [0,1]   
## [329] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [337] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [345] [0,1]    (2,10]   [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [353] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    (1,2]    [0,1]   
## [361] [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]    [0,1]   
## [369] [0,1]    [0,1]    (1,2]    [0,1]    [0,1]    [0,1]    (35,350] (10,35] 
## [377] (35,350] (35,350] (35,350] (10,35]  (35,350] (35,350] (35,350] (35,350]
## [385] (2,10]   (2,10]   (2,10]   (10,35]  (35,350] (10,35]  (10,35]  (10,35] 
## [393] (10,35]  (10,35]  (10,35]  (10,35]  (35,350] (10,35]  (35,350] (2,10]  
## [401] (10,35]  (1,2]    (10,35]  (35,350] (1,2]    [0,1]    (2,10]   [0,1]   
## [409] (2,10]   (1,2]   
## Levels: [0,1] (1,2] (2,10] (10,35] (35,350]
# Load a snapshot of S&P 500 constituents with P/E ratios (2019-06-21):
# symbol, company, sector, industry, pe_ratio
std_and_poor500 <- readRDS("./RInputFiles/std_and_poor500_with_pe_2019-06-21.rds")
glimpse(std_and_poor500)
## Observations: 505
## Variables: 5
## $ symbol   <chr> "MMM", "ABT", "ABBV", "ABMD", "ACN", "ATVI", "ADBE", "AMD"...
## $ company  <chr> "3M Company", "Abbott Laboratories", "AbbVie Inc.", "ABIOM...
## $ sector   <chr> "Industrials", "Health Care", "Health Care", "Health Care"...
## $ industry <chr> "Industrial Conglomerates", "Health Care Equipment", "Phar...
## $ pe_ratio <dbl> 18.31678, 57.66621, 22.43805, 45.63993, 27.00233, 20.13596...
# From previous steps
# Return the multiplicative inverse of x (vectorized over x).
get_reciprocal <- function(x) {
    x^-1
}

# Harmonic mean of x: the reciprocal of the mean of reciprocals.
# This version always drops NAs before averaging.
calc_harmonic_mean <- function(x) {
    1 / mean(1 / x, na.rm = TRUE)
}

# Harmonic mean is the appropriate average for ratios like P/E
std_and_poor500 %>% 
    # Group by sector
    group_by(sector) %>% 
    # Summarize, calculating harmonic mean of P/E ratio
    summarize(hmean_pe_ratio = calc_harmonic_mean(pe_ratio))
## # A tibble: 11 x 2
##    sector                 hmean_pe_ratio
##    <chr>                           <dbl>
##  1 Communication Services           17.5
##  2 Consumer Discretionary           15.2
##  3 Consumer Staples                 19.8
##  4 Energy                           13.7
##  5 Financials                       12.9
##  6 Health Care                      26.6
##  7 Industrials                      18.2
##  8 Information Technology           21.6
##  9 Materials                        16.3
## 10 Real Estate                      32.5
## 11 Utilities                        23.9
# From previous step
# Harmonic mean with caller-controlled NA handling. The default
# na.rm = FALSE matches mean(): any NA in x yields an NA result.
calc_harmonic_mean <- function(x, na.rm = FALSE) {
    1 / mean(1 / x, na.rm = na.rm)
}

std_and_poor500 %>% 
    # Group by sector
    group_by(sector) %>% 
    # Summarize, calculating harmonic mean of P/E ratio
    # (na.rm = TRUE is now passed explicitly by the caller)
    summarize(hmean_pe_ratio = calc_harmonic_mean(pe_ratio, na.rm=TRUE))
## # A tibble: 11 x 2
##    sector                 hmean_pe_ratio
##    <chr>                           <dbl>
##  1 Communication Services           17.5
##  2 Consumer Discretionary           15.2
##  3 Consumer Staples                 19.8
##  4 Energy                           13.7
##  5 Financials                       12.9
##  6 Health Care                      26.6
##  7 Industrials                      18.2
##  8 Information Technology           21.6
##  9 Materials                        16.3
## 10 Real Estate                      32.5
## 11 Utilities                        23.9
# Harmonic mean; any extra arguments (e.g. na.rm, trim) are forwarded
# to mean() via `...`.
calc_harmonic_mean <- function(x, ...) {
    1 / mean(1 / x, ...)
}

std_and_poor500 %>% 
    # Group by sector
    group_by(sector) %>% 
    # Summarize, calculating harmonic mean of P/E ratio
    # (na.rm = TRUE travels through the `...` of calc_harmonic_mean)
    summarize(hmean_pe_ratio=calc_harmonic_mean(pe_ratio, na.rm=TRUE))
## # A tibble: 11 x 2
##    sector                 hmean_pe_ratio
##    <chr>                           <dbl>
##  1 Communication Services           17.5
##  2 Consumer Discretionary           15.2
##  3 Consumer Staples                 19.8
##  4 Energy                           13.7
##  5 Financials                       12.9
##  6 Health Care                      26.6
##  7 Industrials                      18.2
##  8 Information Technology           21.6
##  9 Materials                        16.3
## 10 Real Estate                      32.5
## 11 Utilities                        23.9
# Harmonic mean that fails fast with a clear message when x is not
# numeric, instead of erroring deep inside the arithmetic.
calc_harmonic_mean <- function(x, na.rm = FALSE) {
    # Assert that x is numeric
    assertive.types::assert_is_numeric(x)
    1 / mean(1 / x, na.rm = na.rm)
}

# See what happens when you pass it strings (bombs out, as it should)
# calc_harmonic_mean(std_and_poor500$sector)


# Harmonic mean with full input validation: x must be numeric and
# strictly positive (reciprocals of zero/negative values make the
# harmonic mean meaningless).
calc_harmonic_mean <- function(x, na.rm = FALSE) {
    assertive.types::assert_is_numeric(x)
    # any(..., na.rm = TRUE) so NAs in x don't mask the check
    if (any(assertive.numbers::is_non_positive(x), na.rm = TRUE)) {
        stop("x contains non-positive values, so the harmonic mean makes no sense.")
    }
    1 / mean(1 / x, na.rm = na.rm)
}

# See what happens when you pass it negative numbers (bombs out as it should)
# calc_harmonic_mean(std_and_poor500$pe_ratio - 20)


# Update the function definition to fix the na.rm argument
# Final version: validates x, rejects non-positive values, and repairs
# a malformed na.rm (vector-valued and/or non-logical) down to a single
# logical — warning, rather than erroring, on bad input.
calc_harmonic_mean <- function(x, na.rm = FALSE) {
    assertive.types::assert_is_numeric(x)
    if (any(assertive.numbers::is_non_positive(x), na.rm = TRUE)) {
        stop("x contains non-positive values, so the harmonic mean makes no sense.")
    }
    # Keep only the first element of na.rm, then coerce it to logical
    na.rm <- assertive.base::coerce_to(assertive.base::use_first(na.rm), "logical")
    1 / mean(1 / x, na.rm = na.rm)
}

# See what happens when you pass it malformed na.rm:
# use_first() warns and keeps na.rm = 1, which then coerces to TRUE
calc_harmonic_mean(std_and_poor500$pe_ratio, na.rm = 1:5)
## Warning: Only the first value of na.rm (= 1) will be used.
## Warning: Coercing assertive.base::use_first(na.rm) to class 'logical'.
## [1] 18.23871

Chapter 3 - Return Values and Scope

Returning Values from Functions:

  • Functions return the last value after the end of the function is reached
  • It can sometimes be helpful to return prior to the end of the function - e.g., any NA means the result is NA, so can stop
    • Can use return(myObject) to return myObject AND ALSO end the function
    • Can return(NaN) to return ‘not a number’ rather than throwing an error if the operation is not sensible
  • Plots typically invisibly return, so the function is entirely a side effect
  • Can assign a plot to an object, and explore the elements inside using str()

Returning Multiple Values from Functions:

  • Functions in R return only a single value, though there are workarounds - return a list, or return an object with attributes
    • list(a, b, c) as the final command will return a list with a, b, c
  • Can run multi-assignment using the zeallot package
    • c(a, b, c) %<-% session() # assumes that session returned a list
  • Can also set attributes on objects such as vectors
    • attributes(x)
    • setNames(x, myNames)
  • attr(x, "names") # will pull the names attribute
  • attr(x, "names") <- myNewNames # will make myNewNames the names attribute
  • As an example, dplyr::group_by() adds attributes to a list to specify the group each record belongs to
  • If you need results to have a particular type, use attributes; otherwise, return lists

Environments:

  • Environments are like lists - variables that store variables
  • Environments have parents - can consider them to be inside their parent
    • parent <- parent.env(myEnvironment)
    • environmentName(parent)
    • grandparent <- parent.env(parent)
  • The search() call will show all the environments that are currently available - mostly, the loaded packages
  • Can use the exists() function to test whether a variable exists in an environment
    • exists("founding_year", envir=myEnvir)
    • Any time the variable cannot be found in the current environment, R will search all possible parent environments to try to find it
    • exists("founding_year", envir=myEnvir, inherits=FALSE) # will override the default of continuing to search parent environments and only search the given environment

Scope and Precedence:

  • Any time a function is created, it has an environment
    • If a variable is not defined in the function’s environment, it will look to the parent (calling) environment to try to find it
    • The function’s environment is NOT inherited to its parent, so variables formed in the function are not accessible outside the function

Example code includes:

# TRUE if `year` is a Gregorian leap year: divisible by 4, except
# century years, except century years divisible by 400.
# Expects a single year; returns a single logical.
is_leap_year <- function(year) {
    (year %% 4 == 0 && year %% 100 != 0) || year %% 400 == 0
}


# Recreate the classic base-R `cars` dataset by hand:
# speed of the car and the stopping distance for 50 observations
cars <- data.frame(speed=c(4, 4, 7, 7, 8, 9, 10, 10, 10, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20, 20, 22, 23, 24, 24, 24, 24, 25), 
                   dist=c(2, 10, 4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24, 28, 26, 34, 34, 46, 26, 36, 60, 80, 20, 26, 54, 32, 40, 32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32, 48, 52, 56, 64, 66, 54, 70, 92, 93, 120, 85)
                   )
str(cars)
## 'data.frame':    50 obs. of  2 variables:
##  $ speed: num  4 4 7 7 8 9 10 10 10 11 ...
##  $ dist : num  2 10 4 22 16 10 18 26 34 17 ...
# Using cars, draw a scatter plot of dist vs. speed
plt_dist_vs_speed <- plot(dist ~ speed, data = cars)

# Oh no! The plot object is NULL — plot() draws as a side effect and
# returns NULL, so there is nothing useful to capture
plt_dist_vs_speed
## NULL
# Define a scatter plot fn with data and formula args
pipeable_plot <- function(data, formula) {
    # Call plot() with the formula interface
    plot(formula, data)
    # Invisibly return the input dataset
    invisible(data)
}

# Draw the scatter plot of dist vs. speed again
plt_dist_vs_speed <- cars %>% 
    pipeable_plot(dist ~ speed)

# Now the plot object has a value: the (invisibly returned) input data
plt_dist_vs_speed
##    speed dist
## 1      4    2
## 2      4   10
## 3      7    4
## 4      7   22
## 5      8   16
## 6      9   10
## 7     10   18
## 8     10   26
## 9     10   34
## 10    11   17
## 11    11   28
## 12    12   14
## 13    12   20
## 14    12   24
## 15    12   28
## 16    13   26
## 17    13   34
## 18    13   34
## 19    13   46
## 20    14   26
## 21    14   36
## 22    14   60
## 23    14   80
## 24    15   20
## 25    15   26
## 26    15   54
## 27    16   32
## 28    16   40
## 29    17   32
## 30    17   40
## 31    17   50
## 32    18   42
## 33    18   56
## 34    18   76
## 35    18   84
## 36    19   36
## 37    19   46
## 38    19   68
## 39    20   32
## 40    20   48
## 41    20   52
## 42    20   56
## 43    20   64
## 44    22   66
## 45    23   54
## 46    24   70
## 47    24   92
## 48    24   93
## 49    24  120
## 50    25   85
# Look at the structure of model (it's a mess!) — the glm fit is a
# 31-component list, which is why broom's tidiers are useful below
str(model)
## List of 31
##  $ coefficients     : Named num [1:7] 4.0864 0.374 -0.0199 -0.5807 -0.5782 ...
##   ..- attr(*, "names")= chr [1:7] "(Intercept)" "genderfemale" "income($25k,$55k]" "income($55k,$95k]" ...
##  $ residuals        : Named num [1:346] -0.535 -0.768 -0.944 -0.662 -0.767 ...
##   ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
##  $ fitted.values    : Named num [1:346] 4.3 4.3 17.83 2.96 4.29 ...
##   ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
##  $ effects          : Named num [1:346] -360 -29.2 20.3 -10 23.4 ...
##   ..- attr(*, "names")= chr [1:346] "(Intercept)" "genderfemale" "income($25k,$55k]" "income($55k,$95k]" ...
##  $ R                : num [1:7, 1:7] -97.4 0 0 0 0 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:7] "(Intercept)" "genderfemale" "income($25k,$55k]" "income($55k,$95k]" ...
##   .. ..$ : chr [1:7] "(Intercept)" "genderfemale" "income($25k,$55k]" "income($55k,$95k]" ...
##  $ rank             : int 7
##  $ qr               :List of 5
##   ..$ qr   : num [1:346, 1:7] -97.3861 0.0213 0.0434 0.0177 0.0213 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:346] "25" "26" "27" "29" ...
##   .. .. ..$ : chr [1:7] "(Intercept)" "genderfemale" "income($25k,$55k]" "income($55k,$95k]" ...
##   ..$ rank : int 7
##   ..$ qraux: num [1:7] 1.02 1.02 1.04 1.01 1 ...
##   ..$ pivot: int [1:7] 1 2 3 4 5 6 7
##   ..$ tol  : num 1e-11
##   ..- attr(*, "class")= chr "qr"
##  $ family           :List of 12
##   ..$ family    : chr "poisson"
##   ..$ link      : chr "log"
##   ..$ linkfun   :function (mu)  
##   ..$ linkinv   :function (eta)  
##   ..$ variance  :function (mu)  
##   ..$ dev.resids:function (y, mu, wt)  
##   ..$ aic       :function (y, n, mu, wt, dev)  
##   ..$ mu.eta    :function (eta)  
##   ..$ initialize:  expression({  if (any(y < 0))  stop("negative values not allowed for the 'Poisson' family")  n <- rep.int(1, nobs| __truncated__
##   ..$ validmu   :function (mu)  
##   ..$ valideta  :function (eta)  
##   ..$ simulate  :function (object, nsim)  
##   ..- attr(*, "class")= chr "family"
##  $ linear.predictors: Named num [1:346] 1.46 1.46 2.88 1.09 1.46 ...
##   ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
##  $ deviance         : num 11529
##  $ aic              : num 12864
##  $ null.deviance    : num 18850
##  $ iter             : int 6
##  $ weights          : Named num [1:346] 4.3 4.3 17.83 2.96 4.29 ...
##   ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
##  $ prior.weights    : Named num [1:346] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
##  $ df.residual      : int 339
##  $ df.null          : int 345
##  $ y                : Named num [1:346] 2 1 1 1 1 1 80 104 55 350 ...
##   ..- attr(*, "names")= chr [1:346] "25" "26" "27" "29" ...
##  $ converged        : logi TRUE
##  $ boundary         : logi FALSE
##  $ model            :'data.frame':   346 obs. of  4 variables:
##   ..$ n_visits: num [1:346] 2 1 1 1 1 1 80 104 55 350 ...
##   ..$ gender  : Factor w/ 2 levels "male","female": 2 2 1 1 2 1 2 2 1 2 ...
##   ..$ income  : Factor w/ 4 levels "[$0,$25k]","($25k,$55k]",..: 4 4 4 4 3 1 1 4 2 2 ...
##   ..$ travel  : Factor w/ 3 levels "[0h,0.25h]","(0.25h,4h]",..: 3 3 2 3 3 1 1 1 2 1 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language n_visits ~ gender + income + travel
##   .. .. ..- attr(*, "variables")= language list(n_visits, gender, income, travel)
##   .. .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:4] "n_visits" "gender" "income" "travel"
##   .. .. .. .. ..$ : chr [1:3] "gender" "income" "travel"
##   .. .. ..- attr(*, "term.labels")= chr [1:3] "gender" "income" "travel"
##   .. .. ..- attr(*, "order")= int [1:3] 1 1 1
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: 0x000000001b60c400> 
##   .. .. ..- attr(*, "predvars")= language list(n_visits, gender, income, travel)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "factor" "factor" "factor"
##   .. .. .. ..- attr(*, "names")= chr [1:4] "n_visits" "gender" "income" "travel"
##   ..- attr(*, "na.action")= 'omit' Named int [1:64] 1 2 3 4 5 6 7 8 9 10 ...
##   .. ..- attr(*, "names")= chr [1:64] "1" "2" "3" "4" ...
##  $ na.action        : 'omit' Named int [1:64] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "names")= chr [1:64] "1" "2" "3" "4" ...
##  $ call             : language glm(formula = formula, family = poisson, data = data)
##  $ formula          :Class 'formula'  language n_visits ~ gender + income + travel
##   .. ..- attr(*, ".Environment")=<environment: 0x000000001b60c400> 
##  $ terms            :Classes 'terms', 'formula'  language n_visits ~ gender + income + travel
##   .. ..- attr(*, "variables")= language list(n_visits, gender, income, travel)
##   .. ..- attr(*, "factors")= int [1:4, 1:3] 0 1 0 0 0 0 1 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:4] "n_visits" "gender" "income" "travel"
##   .. .. .. ..$ : chr [1:3] "gender" "income" "travel"
##   .. ..- attr(*, "term.labels")= chr [1:3] "gender" "income" "travel"
##   .. ..- attr(*, "order")= int [1:3] 1 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: 0x000000001b60c400> 
##   .. ..- attr(*, "predvars")= language list(n_visits, gender, income, travel)
##   .. ..- attr(*, "dataClasses")= Named chr [1:4] "numeric" "factor" "factor" "factor"
##   .. .. ..- attr(*, "names")= chr [1:4] "n_visits" "gender" "income" "travel"
##  $ data             :'data.frame':   410 obs. of  4 variables:
##   ..$ n_visits: num [1:410] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ gender  : Factor w/ 2 levels "male","female": 1 1 1 2 1 2 2 2 1 1 ...
##   ..$ income  : Factor w/ 4 levels "[$0,$25k]","($25k,$55k]",..: 4 2 4 2 4 2 4 4 4 4 ...
##   ..$ travel  : Factor w/ 3 levels "[0h,0.25h]","(0.25h,4h]",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ offset           : NULL
##  $ control          :List of 3
##   ..$ epsilon: num 1e-08
##   ..$ maxit  : num 25
##   ..$ trace  : logi FALSE
##  $ method           : chr "glm.fit"
##  $ contrasts        :List of 3
##   ..$ gender: chr "contr.treatment"
##   ..$ income: chr "contr.treatment"
##   ..$ travel: chr "contr.treatment"
##  $ xlevels          :List of 3
##   ..$ gender: chr [1:2] "male" "female"
##   ..$ income: chr [1:4] "[$0,$25k]" "($25k,$55k]" "($55k,$95k]" "($95k,$Inf)"
##   ..$ travel: chr [1:3] "[0h,0.25h]" "(0.25h,4h]" "(4h,Infh)"
##  - attr(*, "class")= chr [1:2] "glm" "lm"
# Use broom tools to get a list of 3 data frames summarizing the model
list(
    # Get model-level values (one row: deviance, AIC, etc.)
    model = broom::glance(model),
    # Get coefficient-level values (one row per term)
    coefficients = broom::tidy(model),
    # Get observation-level values (one row per fitted observation)
    observations = broom::augment(model)
)
## $model
## # A tibble: 1 x 7
##   null.deviance df.null logLik    AIC    BIC deviance df.residual
##           <dbl>   <int>  <dbl>  <dbl>  <dbl>    <dbl>       <int>
## 1        18850.     345 -6425. 12864. 12891.   11529.         339
## 
## $coefficients
## # A tibble: 7 x 5
##   term              estimate std.error statistic   p.value
##   <chr>                <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)         4.09      0.0279   146.    0.       
## 2 genderfemale        0.374     0.0212    17.6   2.18e- 69
## 3 income($25k,$55k]  -0.0199    0.0267    -0.746 4.56e-  1
## 4 income($55k,$95k]  -0.581     0.0343   -16.9   3.28e- 64
## 5 income($95k,$Inf)  -0.578     0.0310   -18.7   6.88e- 78
## 6 travel(0.25h,4h]   -0.627     0.0217   -28.8   5.40e-183
## 7 travel(4h,Infh)    -2.42      0.0492   -49.3   0.       
## 
## $observations
## # A tibble: 346 x 12
##    .rownames n_visits gender income travel .fitted .se.fit  .resid    .hat
##    <chr>        <dbl> <fct>  <fct>  <fct>    <dbl>   <dbl>   <dbl>   <dbl>
##  1 25               2 female ($95k~ (4h,I~    1.46  0.0502  -1.24  0.0109 
##  2 26               1 female ($95k~ (4h,I~    1.46  0.0502  -1.92  0.0109 
##  3 27               1 male   ($95k~ (0.25~    2.88  0.0269  -5.28  0.0129 
##  4 29               1 male   ($95k~ (4h,I~    1.09  0.0490  -1.32  0.00711
##  5 30               1 female ($55k~ (4h,I~    1.46  0.0531  -1.92  0.0121 
##  6 31               1 male   [$0,$~ [0h,0~    4.09  0.0279 -10.4   0.0465 
##  7 33              80 female [$0,$~ [0h,0~    4.46  0.0235  -0.710 0.0479 
##  8 34             104 female ($95k~ [0h,0~    3.88  0.0261   6.90  0.0332 
##  9 35              55 male   ($25k~ (0.25~    3.44  0.0222   3.85  0.0153 
## 10 36             350 female ($25k~ [0h,0~    4.44  0.0206  21.5   0.0360 
## # ... with 336 more rows, and 3 more variables: .sigma <dbl>, .cooksd <dbl>,
## #   .std.resid <dbl>
# From previous step
# Summarize a fitted model at three levels of detail using broom:
# one-row model statistics, per-coefficient statistics, and
# per-observation statistics, returned as a named list.
groom_model <- function(model) {
    tidiers <- list(
        model = broom::glance,
        coefficients = broom::tidy,
        observations = broom::augment
    )
    # lapply() keeps the names, so the result has the same structure
    lapply(tidiers, function(tidier) tidier(model))
}


library(zeallot)  # needed for %<-%

# Call groom_model on model, assigning to 3 variables
c(mdl, cff, obs) %<-% groom_model(model)

# See these individual variables
mdl; cff; obs
## # A tibble: 1 x 7
##   null.deviance df.null logLik    AIC    BIC deviance df.residual
##           <dbl>   <int>  <dbl>  <dbl>  <dbl>    <dbl>       <int>
## 1        18850.     345 -6425. 12864. 12891.   11529.         339
## # A tibble: 7 x 5
##   term              estimate std.error statistic   p.value
##   <chr>                <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)         4.09      0.0279   146.    0.       
## 2 genderfemale        0.374     0.0212    17.6   2.18e- 69
## 3 income($25k,$55k]  -0.0199    0.0267    -0.746 4.56e-  1
## 4 income($55k,$95k]  -0.581     0.0343   -16.9   3.28e- 64
## 5 income($95k,$Inf)  -0.578     0.0310   -18.7   6.88e- 78
## 6 travel(0.25h,4h]   -0.627     0.0217   -28.8   5.40e-183
## 7 travel(4h,Infh)    -2.42      0.0492   -49.3   0.
## # A tibble: 346 x 12
##    .rownames n_visits gender income travel .fitted .se.fit  .resid    .hat
##    <chr>        <dbl> <fct>  <fct>  <fct>    <dbl>   <dbl>   <dbl>   <dbl>
##  1 25               2 female ($95k~ (4h,I~    1.46  0.0502  -1.24  0.0109 
##  2 26               1 female ($95k~ (4h,I~    1.46  0.0502  -1.92  0.0109 
##  3 27               1 male   ($95k~ (0.25~    2.88  0.0269  -5.28  0.0129 
##  4 29               1 male   ($95k~ (4h,I~    1.09  0.0490  -1.32  0.00711
##  5 30               1 female ($55k~ (4h,I~    1.46  0.0531  -1.92  0.0121 
##  6 31               1 male   [$0,$~ [0h,0~    4.09  0.0279 -10.4   0.0465 
##  7 33              80 female [$0,$~ [0h,0~    4.46  0.0235  -0.710 0.0479 
##  8 34             104 female ($95k~ [0h,0~    3.88  0.0261   6.90  0.0332 
##  9 35              55 male   ($25k~ (0.25~    3.44  0.0222   3.85  0.0153 
## 10 36             350 female ($25k~ [0h,0~    4.44  0.0206  21.5   0.0360 
## # ... with 336 more rows, and 3 more variables: .sigma <dbl>, .cooksd <dbl>,
## #   .std.resid <dbl>
# Pipe-friendly plot that also records which formula was plotted, as a
# "formula" attribute on the invisibly returned data.
pipeable_plot <- function(data, formula) {
    plot(formula, data)
    # structure() attaches the attribute without a separate assignment
    invisible(structure(data, formula = formula))
}

# From previous exercise
plt_dist_vs_speed <- cars %>% 
    pipeable_plot(dist ~ speed)

# Examine the structure of the result: the data plus a "formula" attribute
str(plt_dist_vs_speed)
## 'data.frame':    50 obs. of  2 variables:
##  $ speed: num  4 4 7 7 8 9 10 10 10 11 ...
##  $ dist : num  2 10 4 22 16 10 18 26 34 17 ...
##  - attr(*, "formula")=Class 'formula'  language dist ~ speed
##   .. ..- attr(*, ".Environment")=<environment: 0x000000001d791228>
# Three South African example objects of different types:
# a tibble of capital cities ...
capitals <- tibble::tibble(city=c("Cape Town", "Bloemfontein", "Pretoria"), 
                           type_of_capital=c("Legislative", "Judicial", "Administrative")
                           )
# ... a character vector of national parks ...
national_parks <- c('Addo Elephant National Park', 'Agulhas National Park', 'Ai-Ais/Richtersveld Transfrontier Park', 'Augrabies Falls National Park', 'Bontebok National Park', 'Camdeboo National Park', 'Golden Gate Highlands National Park', 'Hluhluwe–Imfolozi Park', 'Karoo National Park', 'Kgalagadi Transfrontier Park', 'Knysna National Lake Area', 'Kruger National Park', 'Mapungubwe National Park', 'Marakele National Park', 'Mokala National Park', 'Mountain Zebra National Park', 'Namaqua National Park', 'Table Mountain National Park', 'Tankwa Karoo National Park', 'Tsitsikamma National Park', 'West Coast National Park', 'Wilderness National Park')
# ... and a time series of population, every 5 years from 1996 to 2016
population <- ts(c(40583573, 44819778, 47390900, 51770560, 55908900), start=1996, end=2016, deltat=5)

capitals
## # A tibble: 3 x 2
##   city         type_of_capital
##   <chr>        <chr>          
## 1 Cape Town    Legislative    
## 2 Bloemfontein Judicial       
## 3 Pretoria     Administrative
national_parks  # a plain character vector
##  [1] "Addo Elephant National Park"           
##  [2] "Agulhas National Park"                 
##  [3] "Ai-Ais/Richtersveld Transfrontier Park"
##  [4] "Augrabies Falls National Park"         
##  [5] "Bontebok National Park"                
##  [6] "Camdeboo National Park"                
##  [7] "Golden Gate Highlands National Park"   
##  [8] "Hluhluwe–Imfolozi Park"                
##  [9] "Karoo National Park"                   
## [10] "Kgalagadi Transfrontier Park"          
## [11] "Knysna National Lake Area"             
## [12] "Kruger National Park"                  
## [13] "Mapungubwe National Park"              
## [14] "Marakele National Park"                
## [15] "Mokala National Park"                  
## [16] "Mountain Zebra National Park"          
## [17] "Namaqua National Park"                 
## [18] "Table Mountain National Park"          
## [19] "Tankwa Karoo National Park"            
## [20] "Tsitsikamma National Park"             
## [21] "West Coast National Park"              
## [22] "Wilderness National Park"
population
## Time Series:
## Start = 1996 
## End = 2016 
## Frequency = 0.2 
## [1] 40583573 44819778 47390900 51770560 55908900
# From previous steps
rsa_lst <- list(
    capitals = capitals,
    national_parks = national_parks,
    population = population
)
rsa_env <- list2env(rsa_lst)

ls.str(rsa_lst)
## capitals : Classes 'tbl_df', 'tbl' and 'data.frame': 3 obs. of  2 variables:
##  $ city           : chr  "Cape Town" "Bloemfontein" "Pretoria"
##  $ type_of_capital: chr  "Legislative" "Judicial" "Administrative"
## national_parks :  chr [1:22] "Addo Elephant National Park" "Agulhas National Park" ...
## population :  Time-Series [1:5] from 1996 to 2016: 40583573 44819778 47390900 51770560 55908900
ls.str(rsa_env)
## capitals : Classes 'tbl_df', 'tbl' and 'data.frame': 3 obs. of  2 variables:
##  $ city           : chr  "Cape Town" "Bloemfontein" "Pretoria"
##  $ type_of_capital: chr  "Legislative" "Judicial" "Administrative"
## national_parks :  chr [1:22] "Addo Elephant National Park" "Agulhas National Park" ...
## population :  Time-Series [1:5] from 1996 to 2016: 40583573 44819778 47390900 51770560 55908900
# Find the parent environment of rsa_env
parent <- parent.env(rsa_env)

# Print its name
environmentName(parent)
## [1] "R_GlobalEnv"
# Compare the contents of the global environment and rsa_env
# ls.str(globalenv())
ls.str(rsa_env)
## capitals : Classes 'tbl_df', 'tbl' and 'data.frame': 3 obs. of  2 variables:
##  $ city           : chr  "Cape Town" "Bloemfontein" "Pretoria"
##  $ type_of_capital: chr  "Legislative" "Judicial" "Administrative"
## national_parks :  chr [1:22] "Addo Elephant National Park" "Agulhas National Park" ...
## population :  Time-Series [1:5] from 1996 to 2016: 40583573 44819778 47390900 51770560 55908900
# Does population exist in rsa_env?
exists("population", envir = rsa_env)
## [1] TRUE
# Does population exist in rsa_env, ignoring inheritance?
exists("population", envir = rsa_env, inherits=FALSE)
## [1] TRUE

Chapter 4 - Case Study on Grain Yields

Grain Yields and Conversion:

  • Historic grain yield data is from NASS (National Agricultural Statistical Service)
  • Conversions from imperial to metric units
    • 1 acre is about the amount of land that 2 oxen can plough in one day
    • 1 hectare is roughly equal to two football fields (100m x 100m)
    • 1 bushel is equal to two buckets of peaches
    • 1 kilogram is roughly equal to the mass of 1 squirrel monkey

Visualizing Grain Yields:

  • May want to explore the variability of yield over time
  • Can use a combination of dplyr inner joins and ggplot2 to explore a faceted look

Modeling Grain Yields:

  • The geom_smooth() uses the gam (generalized additive model)
    • lm(y ~ x, data=) will run the standard linear model
    • mgcv::gam(y ~ s(x1) + x2, data=) will run the gam, where the s() is running a smooth on x1
  • Can get predicted responses using predict()
    • predict(model, myNewData, type=“response”)

Wrap Up:

  • Writing functions - motivations include less re-writing and more consistency (fewer copy/paste errors)
  • Setting default arguments for functions, and using … to pass arguments between functions
  • Return values, including early return and returning multiple values
  • Case study for writing and using functions as part of analysis
  • Functions do not need to be large and complex - can be as simple as a single line

Example code includes:

library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
corn <- readRDS("./RInputFiles/nass.corn.rds")
wheat <- readRDS("./RInputFiles/nass.wheat.rds")
barley <- readRDS("./RInputFiles/nass.barley.rds")
corn <- as_tibble(corn)
wheat <- as_tibble(wheat)
barley <- as_tibble(barley)
str(corn)
## Classes 'tbl_df', 'tbl' and 'data.frame':    6381 obs. of  4 variables:
##  $ year                  : int  1866 1866 1866 1866 1866 1866 1866 1866 1866 1866 ...
##  $ state                 : chr  "Alabama" "Arkansas" "California" "Connecticut" ...
##  $ farmed_area_acres     : num  1050000 280000 42000 57000 200000 ...
##  $ yield_bushels_per_acre: num  9 18 28 34 23 9 6 29 36.5 32 ...
str(wheat)
## Classes 'tbl_df', 'tbl' and 'data.frame':    5963 obs. of  4 variables:
##  $ year                  : int  1866 1866 1866 1866 1866 1866 1866 1866 1866 1866 ...
##  $ state                 : chr  "Alabama" "Arkansas" "California" "Connecticut" ...
##  $ farmed_area_acres     : num  125000 50000 650000 2000 59000 245000 2300000 1550000 1190000 68000 ...
##  $ yield_bushels_per_acre: num  5 6.5 18 17.5 11 4 10.5 10 13 19 ...
str(barley)
## Classes 'tbl_df', 'tbl' and 'data.frame':    4839 obs. of  4 variables:
##  $ year                  : int  1866 1866 1866 1866 1866 1866 1866 1866 1866 1866 ...
##  $ state                 : chr  "Connecticut" "Illinois" "Indiana" "Iowa" ...
##  $ farmed_area_acres     : num  1000 96000 11000 66000 2000 10000 34000 7000 21000 20000 ...
##  $ yield_bushels_per_acre: num  22.5 23.4 23 22 23 23.5 21.5 25.5 26 26 ...
# Write a function to convert acres to sq. yards
# Convert an area in acres to square yards (1 acre = 4840 sq. yards).
acres_to_sq_yards <- function(acres) {
    4840 * acres
}

# Write a function to convert yards to meters
# Convert a length in yards to meters (1 yard = 36 inches, 1 inch = 0.0254 m).
yards_to_meters <- function(yards) {
    inches <- yards * 36
    inches * 0.0254
}

# Write a function to convert sq. meters to hectares
# Convert an area in square meters to hectares.
sq_meters_to_hectares <- function(sq_meters) {
    # One hectare is defined as 10,000 square meters
    sq_m_per_hectare <- 10000
    sq_meters / sq_m_per_hectare
}


# Write a function to convert sq. yards to sq. meters
# Convert an area in square yards to square meters.
# Strategy: take the side length of an equivalent square (sqrt), convert
# that length from yards to meters (1 yd = 36 in = 0.9144 m), then square it.
sq_yards_to_sq_meters <- function(sq_yards) {
    side_yards <- sqrt(sq_yards)
    side_meters <- side_yards * 36 * 0.0254
    side_meters^2
}

# Write a function to convert acres to hectares
# Convert an area in acres to hectares.
# Reuses sq_yards_to_sq_meters() (defined above) rather than repeating its
# sqrt / yards_to_meters / raise_to_power pipeline inline, so the two
# conversions cannot drift apart.
acres_to_hectares <- function(acres) {
    acres %>%
        # Convert acres to sq. yards
        acres_to_sq_yards() %>%
        # Convert sq. yards to sq. meters
        sq_yards_to_sq_meters() %>%
        # Convert sq. meters to hectares
        sq_meters_to_hectares()
}


# Write a function to convert lb to kg
# Convert a mass in pounds to kilograms (1 lb = 0.45359237 kg exactly).
lbs_to_kgs <- function(lbs) {
    kg_per_lb <- 0.45359237
    kg_per_lb * lbs
}

# Write a function to convert bushels to lbs
# Convert a number of bushels to pounds, using a crop-specific weight.
# crop is used to index the lookup table, so it must be one of the names
# below ("barley", "corn", "wheat", or "volume").
bushels_to_lbs <- function(bushels, crop) {
    # Pounds per bushel for each supported crop
    lbs_per_bushel <- c(barley = 48, corn = 56, wheat = 60, volume = 8)
    lbs_per_bushel[crop] * bushels
}

# Write a function to convert bushels to kg
# Convert a number of bushels to kilograms for the given crop, by going
# through pounds (bushels -> lbs -> kgs).
bushels_to_kgs <- function(bushels, crop) {
    pounds <- bushels_to_lbs(bushels, crop)
    lbs_to_kgs(pounds)
}

# Write a function to convert bushels/acre to kg/ha
# Convert a yield in bushels/acre to kilograms/hectare.
#
# NOTE(fix): converting a per-acre rate to a per-hectare rate requires
# DIVIDING by the number of hectares in one acre (1 acre ~= 0.4047 ha).
# A hectare is ~2.47 acres, so the per-hectare figure must be LARGER than
# the per-acre one. The previous version piped into acres_to_hectares(),
# i.e. multiplied by 0.4047, understating yields by a factor of ~6
# (transcribed outputs elsewhere in this file reflect the old values).
bushels_per_acre_to_kgs_per_hectare <- function(bushels_per_acre, crop = c("barley", "corn", "wheat")) {
    # Match the crop argument against the supported crops
    crop <- match.arg(crop)
    # Convert the mass part: bushels -> kg (still per acre)
    kgs_per_acre <- bushels_to_kgs(bushels_per_acre, crop)
    # Convert the area part: per acre -> per hectare
    kgs_per_acre / acres_to_hectares(1)
}


# View the corn dataset
glimpse(corn)
## Observations: 6,381
## Variables: 4
## $ year                   <int> 1866, 1866, 1866, 1866, 1866, 1866, 1866, 18...
## $ state                  <chr> "Alabama", "Arkansas", "California", "Connec...
## $ farmed_area_acres      <dbl> 1050000, 280000, 42000, 57000, 200000, 12500...
## $ yield_bushels_per_acre <dbl> 9.0, 18.0, 28.0, 34.0, 23.0, 9.0, 6.0, 29.0,...
corn <- corn %>%
    # Add some columns
    mutate(
        # Convert farmed area from acres to ha
        farmed_area_ha = acres_to_hectares(farmed_area_acres),
        # Convert yield from bushels/acre to kg/ha
        yield_kg_per_ha = bushels_per_acre_to_kgs_per_hectare(yield_bushels_per_acre, crop = "corn")
    )

# Wrap this code into a function
# Add metric-unit columns to a grain dataset:
#   farmed_area_ha  - farmed area converted from acres to hectares
#   yield_kg_per_ha - yield converted from bushels/acre to kg/ha for `crop`
fortify_with_metric_units <- function(data, crop) {
    mutate(
        data,
        farmed_area_ha = acres_to_hectares(farmed_area_acres),
        yield_kg_per_ha = bushels_per_acre_to_kgs_per_hectare(yield_bushels_per_acre, crop = crop)
    )
}

# Try it on the wheat dataset
wheat <- fortify_with_metric_units(wheat, "wheat")


# Using corn, plot yield (kg/ha) vs. year
ggplot(corn, aes(x=year, y=yield_kg_per_ha)) +
    # Add a line layer, grouped by state
    geom_line(aes(group = state)) +
    # Add a smooth trend layer
    geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Wrap this plotting code into a function
# Plot yield (kg/ha) vs. year: one line per state plus an overall smooth trend.
plot_yield_vs_year <- function(data) {
    base_plot <- ggplot(data, aes(year, yield_kg_per_ha))
    base_plot +
        geom_line(aes(group = state)) +
        geom_smooth()
}

# Test it on the wheat dataset
plot_yield_vs_year(wheat)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

usa_census_regions <- tibble::tibble(census_region=c('New England', 'New England', 'New England', 'New England', 'New England', 'New England', 'Mid-Atlantic', 'Mid-Atlantic', 'Mid-Atlantic', 'East North Central', 'East North Central', 'East North Central', 'East North Central', 'East North Central', 'West North Central', 'West North Central', 'West North Central', 'West North Central', 'West North Central', 'West North Central', 'West North Central', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'South Atlantic', 'East South Central', 'East South Central', 'East South Central', 'East South Central', 'West South Central', 'West South Central', 'West South Central', 'West South Central', 'Mountain', 'Mountain', 'Mountain', 'Mountain', 'Mountain', 'Mountain', 'Mountain', 'Mountain', 'Pacific', 'Pacific', 'Pacific', 'Pacific', 'Pacific'), 
                                     state=c('Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'Rhode Island', 'Vermont', 'New Jersey', 'New York', 'Pennsylvania', 'Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin', 'Iowa', 'Kansas', 'Minnesota', 'Missouri', 'Nebraska', 'North Dakota', 'South Dakota', 'Delaware', 'Florida', 'Georgia', 'Maryland', 'North Carolina', 'South Carolina', 'Virginia', 'District of Columbia', 'West Virginia', 'Alabama', 'Kentucky', 'Mississippi', 'Tennessee', 'Arkansas', 'Louisiana', 'Oklahoma', 'Texas', 'Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'New Mexico', 'Utah', 'Wyoming', 'Alaska', 'California', 'Hawaii', 'Oregon', 'Washington')
                                     )
usa_census_regions
## # A tibble: 51 x 2
##    census_region      state        
##    <chr>              <chr>        
##  1 New England        Connecticut  
##  2 New England        Maine        
##  3 New England        Massachusetts
##  4 New England        New Hampshire
##  5 New England        Rhode Island 
##  6 New England        Vermont      
##  7 Mid-Atlantic       New Jersey   
##  8 Mid-Atlantic       New York     
##  9 Mid-Atlantic       Pennsylvania 
## 10 East North Central Illinois     
## # ... with 41 more rows
# Inner join the corn dataset to usa_census_regions by state
corn <- corn %>%
    inner_join(usa_census_regions, by = "state")

# Wrap this code into a function
# Add a census_region column by inner-joining to the usa_census_regions
# lookup table on state (rows with no matching state are dropped).
fortify_with_census_region <- function(data) {
    inner_join(data, usa_census_regions, by = "state")
}

# Try it on the wheat dataset
wheat <- fortify_with_census_region(wheat)


# Plot yield vs. year for the corn dataset
plot_yield_vs_year(corn) +
    facet_wrap(~census_region)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Wrap this code into a function
# Plot yield vs. year, faceted into one panel per census region.
plot_yield_vs_year_by_region <- function(data) {
    yearly_plot <- plot_yield_vs_year(data)
    yearly_plot + facet_wrap(vars(census_region))
}

# Try it on the wheat dataset
plot_yield_vs_year_by_region(wheat)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Wrap the model code into a function
# Fit a GAM of yield (kg/ha): a smooth trend in year, s(year), plus a
# separate additive effect for each census region. `data` must contain
# yield_kg_per_ha, year, and census_region columns.
run_gam_yield_vs_year_by_region <- function(data) {
    mgcv::gam(yield_kg_per_ha ~ s(year) + census_region, data = data)
}

# Try it on the wheat dataset
wheat_model <- run_gam_yield_vs_year_by_region(wheat)
# Fit the corn model on the corn data. (Previously this line also passed
# `wheat`, which is why the corn and wheat 2050 predictions shown below
# are numerically identical.)
corn_model <- run_gam_yield_vs_year_by_region(corn)


# Make predictions in 2050  
predict_this <- data.frame(year = 2050, census_region = unique(usa_census_regions$census_region))

# Predict the yield
pred_yield_kg_per_ha <- predict(corn_model, predict_this, type = "response")

predict_this %>%
    # Add the prediction as a column of predict_this 
    mutate(pred_yield_kg_per_ha = pred_yield_kg_per_ha)
##   year      census_region pred_yield_kg_per_ha
## 1 2050        New England             901.7706
## 2 2050       Mid-Atlantic             888.5455
## 3 2050 East North Central             895.8256
## 4 2050 West North Central             816.2401
## 5 2050     South Atlantic             831.8758
## 6 2050 East South Central             816.5198
## 7 2050 West South Central             780.2498
## 8 2050           Mountain             893.8168
## 9 2050            Pacific             934.7567
# Wrap this prediction code into a function
# Predict yields (kg/ha) for every census region in a given year.
# Returns a data frame with year, census_region, and pred_yield_kg_per_ha.
predict_yields <- function(model, year) {
    # One row per census region, all for the requested year
    to_predict <- data.frame(
        year = year,
        census_region = unique(usa_census_regions$census_region)
    )
    # Attach the model's predicted response as a new column
    mutate(to_predict, pred_yield_kg_per_ha = predict(model, to_predict, type = "response"))
}

# Try it on the wheat dataset
predict_yields(wheat_model, year=2050)
##   year      census_region pred_yield_kg_per_ha
## 1 2050        New England             901.7706
## 2 2050       Mid-Atlantic             888.5455
## 3 2050 East North Central             895.8256
## 4 2050 West North Central             816.2401
## 5 2050     South Atlantic             831.8758
## 6 2050 East South Central             816.5198
## 7 2050 West South Central             780.2498
## 8 2050           Mountain             893.8168
## 9 2050            Pacific             934.7567
# From previous step
fortified_barley <- barley %>% 
    fortify_with_metric_units(crop="barley") %>%
    fortify_with_census_region()

# Plot yield vs. year by region
plot_yield_vs_year_by_region(fortified_barley)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

fortified_barley %>% 
    # Run a GAM of yield vs. year by region
    run_gam_yield_vs_year_by_region()  %>% 
    # Make predictions of yields in 2050
    predict_yields(year=2050)
##   year      census_region pred_yield_kg_per_ha
## 1 2050        New England             692.7372
## 2 2050       Mid-Atlantic             695.6051
## 3 2050 East North Central             689.5561
## 4 2050 West North Central             629.5246
## 5 2050     South Atlantic             695.7666
## 6 2050 East South Central             657.6750
## 7 2050 West South Central             595.9212
## 8 2050           Mountain             759.6959
## 9 2050            Pacific             698.9621

Introduction to Data Visualization with ggplot2

Chapter 1 - Introduction

Introduction:

  • Data visualization is a core skill that combines statistics and design
  • Visualizations can be exploratory or explanatory - audiences and intentions are frequently different
    • Good design begins with thinking about the audience, which can be as small as the analyst themselves
    • There is frequently an iterative, fine-tuning approach to the visualization

Grammar of Graphics:

  • Graphics are built on an underlying grammar
    • “Grammar of Graphics” - Leland Wilkinson
    • Graphics are made up of distinct layers of grammatical elements
    • Meaningful plots are made through aesthetic mappings
  • There are three essential elements of graphical grammar, plus (optionally) themes, statistics, coordinates, and facets
    • Data - the dataset being plotted
    • Aesthetics - scales on to which the data are mapped
    • Geometries - visual elements used for our data
    • Themes - non-data ink

Layers:

  • The ggplot2 package implements the grammar of graphics in R
  • Can use the iris dataset for the data in the examples, with aesthetics mapping the data and the geometry specifying how to plot it
    • ggplot(iris, aes(x=, y=)) + geom_jitter()
  • Can also control the non-data ink by adding a theme, such as myplot + theme_classic()

Example code includes:

data(mtcars)

# Explore the mtcars data frame with str()
str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
# Execute the following command
ggplot(mtcars, aes(cyl, mpg)) +
    geom_point()

# Change the command below so that cyl is treated as factor
ggplot(mtcars, aes(factor(cyl), mpg)) +
    geom_point()

# Edit to add a color aesthetic mapped to disp
ggplot(mtcars, aes(wt, mpg, color=disp)) +
    geom_point()

# Change the color aesthetic to a size aesthetic
ggplot(mtcars, aes(wt, mpg, size = disp)) +
    geom_point()

data(diamonds)
str(diamonds)
## Classes 'tbl_df', 'tbl' and 'data.frame':    53940 obs. of  10 variables:
##  $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Add geom_point() with +
ggplot(diamonds, aes(carat, price)) +
    geom_point()

# Add geom_smooth() with +
ggplot(diamonds, aes(carat, price)) +
    geom_point() +
    geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Make the points 40% opaque
ggplot(diamonds, aes(carat, price, color = clarity)) +
    geom_point(alpha=0.4) +
    geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Draw a ggplot
plt_price_vs_carat <- ggplot(
    # Use the diamonds dataset
    diamonds,
    # For the aesthetics, map x to carat and y to price
    aes(x=carat, y=price)
)

# Add a point layer to plt_price_vs_carat
plt_price_vs_carat + 
    geom_point()

# Edit this to make points 20% opaque: plt_price_vs_carat_transparent
plt_price_vs_carat_transparent <- plt_price_vs_carat + 
    geom_point(alpha=0.2)

# See the plot
plt_price_vs_carat_transparent

# Edit this to map color to clarity,
# Assign the updated plot to a new object
plt_price_vs_carat_by_clarity <- plt_price_vs_carat + 
    geom_point(aes(color=clarity))

# See the plot
plt_price_vs_carat_by_clarity


Chapter 2 - Aesthetics

Visible Aesthetics:

  • On a scatter plot, the x and y axes are aesthetics (the data are mapped on to them) and color/size are optional aesthetics
    • If an aesthetic is mapped in the main ggplot(), it will inherit to everything else in the plot
    • Other aesthetic types include fill, alpha, linetype, labels, shape

Using Attributes:

  • Attributes are how things look in ggplot2
    • By contrast, aesthetics in ggplot2 are mappings from the data
  • Attributes should always be set inside the geom, such as geom_point(color=“red”)

Modifying Aesthetics:

  • One common adjustment is position, or how overlapping points are managed
    • “identity” is the default for a scatter plot - value is exactly where the x/y map
    • “jitter” adds some random noise, either as geom_point(position=“jitter”) or with a pre-defined variable posn_j <- position_jitter(0.1, seed=136) and then geom_point(position=posn_j)
  • Scale functions can all be accessed using scale_*()
    • scale_x_continuous(limits=c(2, 8), breaks=seq(2, 8, 3))
    • scale_color_discrete()

Aesthetics Best Practices:

  • Form should follow function, which is highly audience and intention dependent
    • Accuracy and efficiency are always high priorities
    • Visually appealing is a secondary objective
  • The best aesthetics are both efficient and accurate
    • Unaligned axes make for difficult comparisons across plots
    • Overplotting can be a meaningful concern

Example code includes:

mtcars <- mtcars %>%
    mutate(fcyl=factor(cyl), fam=factor(am, levels=c(0, 1), labels=c("automatic", "manual")))
str(mtcars)
## 'data.frame':    32 obs. of  13 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
##  $ fcyl: Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
##  $ fam : Factor w/ 2 levels "automatic","manual": 2 2 2 1 1 1 1 1 1 1 ...
# Map x to mpg and y to fcyl
ggplot(mtcars, aes(x=mpg, y=fcyl)) +
    geom_point()

# Swap mpg and fcyl
ggplot(mtcars, aes(x=fcyl, y=mpg)) +
    geom_point()

# Map x to wt, y to mpg and color to fcyl
ggplot(mtcars, aes(x=wt, y=mpg, color=fcyl)) +
    geom_point()

ggplot(mtcars, aes(wt, mpg, color = fcyl)) +
    # Set the shape and size of the points
    geom_point(shape=1, size=4)

# Map color to fam
ggplot(mtcars, aes(wt, mpg, fill = fcyl, color=fam)) +
    geom_point(shape = 21, size = 4, alpha = 0.6)

# Base layer
plt_mpg_vs_wt <- ggplot(mtcars, aes(wt, mpg))

# Map fcyl to shape, not alpha
plt_mpg_vs_wt +
    geom_point(aes(shape = fcyl))

# Base layer
plt_mpg_vs_wt <- ggplot(mtcars, aes(wt, mpg))

# Use text layer and map fcyl to label
plt_mpg_vs_wt +
    geom_text(aes(label = fcyl))

# A hexadecimal color
my_blue <- "#4ABEFF"

# Change the color mapping to a fill mapping
ggplot(mtcars, aes(wt, mpg, fill = fcyl)) +
    # Set point size and shape
    geom_point(color=my_blue, size=10, shape=1)

ggplot(mtcars, aes(wt, mpg, color = fcyl)) +
    # Add point layer with alpha 0.5
    geom_point(alpha=0.5)

ggplot(mtcars, aes(wt, mpg, color = fcyl)) +
    # Add text layer with label rownames(mtcars) and color red
    geom_text(label=rownames(mtcars), color="red")

ggplot(mtcars, aes(wt, mpg, color = fcyl)) +
    # Add points layer with shape 24 and color yellow
    geom_point(shape=24, color="yellow")

# 5 aesthetics: add a mapping of size to hp / wt
ggplot(mtcars, aes(mpg, qsec, color = fcyl, shape = fam, size=hp/wt)) +
    geom_point()

ggplot(mtcars, aes(fcyl, fill = fam)) +
    geom_bar() +
    # Set the axis labels
    labs(x="Number of Cylinders", y="Count")

palette <- c(automatic = "#377EB8", manual = "#E41A1C")

ggplot(mtcars, aes(fcyl, fill = fam)) +
    geom_bar() +
    labs(x = "Number of Cylinders", y = "Count") +
    # Set the fill color scale
    scale_fill_manual("Transmission", values = palette)

palette <- c(automatic = "#377EB8", manual = "#E41A1C")

# Set the position
ggplot(mtcars, aes(fcyl, fill = fam)) +
    geom_bar(position="dodge") +
    labs(x = "Number of Cylinders", y = "Count") +
    scale_fill_manual("Transmission", values = palette)

ggplot(mtcars, aes(mpg, 0)) +
    geom_jitter() +
    # Set the y-axis limits
    ylim(c(-2, 2))


Chapter 3 - Geometries

Scatter Plots:

  • Scatter plots require an x, y aesthetic
    • Optional aesthetics/attributes include alpha, color, fill, shape, size, stroke
    • Can include a second geom_point() with a new data= argument, and all other aesthetics will inherit
    • Can use shape=1 (unfilled circle) to help with over-plotting; can also be used in combination with jitter, alpha, etc.
  • Means should generally be plotted with a measure of dispersion (spread)

Histograms:

  • Histograms are a special form of bar plot that shows distributions
    • geom_histogram(binwidth=, center=) defaults to 30 bins and can be overridden by setting the binwidth; setting center=0.5*binwidth ensure labels between bars rather than under bars
  • When multiple series are in the same histogram (e.g., with aes(fill=myCat)), the default is position=“stack”
    • Can use position=“dodge” for offsetting (can be difficult to read)
    • Can use position=“fill” for the proportions by bin rather than counts by bin

Bar Plots:

  • There are two geoms for creating bar/column charts
    • geom_bar() counts the number of cases at each x position - stat=“count”
    • geom_col() plots actual values - stat=“identity”
  • Example of creating column chart with specified y and errorbars
    • ggplot(myDF, aes(x=sex, y=avgHt)) + geom_col() + geom_errorbar(aes(ymin=avgHt-sdHt, ymax=avgHt+sdHt), width=0.1)
    • This type of plot is widely discouraged as a “Wile E Coyote” dynamite plot

Line Plots:

  • The color aesthetic is often useful for having multiple series plotted on the same chart
    • Can use geom_area() if the goal is to add the series with each sub-total plotted
    • Can use fill= if the goal is to show the proportion of each series by time period
    • Can use geom_ribbon(aes(ymin=0)) to have over-plotted, non-stacked, area charts

Example code includes:

# Plot price vs. carat, colored by clarity
plt_price_vs_carat_by_clarity <- ggplot(diamonds, aes(carat, price, color = clarity))

# Set transparency to 0.5
plt_price_vs_carat_by_clarity + 
    geom_point(alpha = 0.5, shape = 16)

# Plot base
plt_mpg_vs_fcyl_by_fam <- ggplot(mtcars, aes(fcyl, mpg, color = fam))

# Default points are shown for comparison
plt_mpg_vs_fcyl_by_fam + 
    geom_point()

# Now jitter and dodge the point positions
plt_mpg_vs_fcyl_by_fam + 
    geom_point(position = position_jitterdodge(jitter.width=0.3, dodge.width=0.3))

data(iris)

ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) +
    # Swap for jitter layer with width 0.1
    geom_jitter(width=0.1, alpha=0.5)

ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) +
    # Set the position to jitter
    geom_point(position="jitter", alpha = 0.5)

ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) +
    # Use a jitter position function with width 0.1
    geom_point(position=position_jitter(width=0.1), alpha = 0.5)

data(Vocab, package="carData")

# Examine the structure of Vocab
str(Vocab)
## 'data.frame':    30351 obs. of  4 variables:
##  $ year      : num  1974 1974 1974 1974 1974 ...
##  $ sex       : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 2 2 2 1 1 ...
##  $ education : num  14 16 10 10 12 16 17 10 12 11 ...
##  $ vocabulary: num  9 9 9 5 8 8 9 5 3 5 ...
##  - attr(*, "na.action")= 'omit' Named int  1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "names")= chr  "19720001" "19720002" "19720003" "19720004" ...
# Plot vocabulary vs. education
ggplot(Vocab, aes(x=education, y=vocabulary)) +
    # Add a point layer
    geom_point()

ggplot(Vocab, aes(education, vocabulary)) +
    # Set the shape to 1
    geom_jitter(alpha = 0.2, shape=1)

datacamp_light_blue <- "#51A8C9"

ggplot(mtcars, aes(x=mpg, y=..density..)) +
    # Set the fill color to datacamp_light_blue
    geom_histogram(binwidth = 1, fill=datacamp_light_blue)

ggplot(mtcars, aes(mpg, fill = fam)) +
    # Use position "fill" to show per-bin proportions rather than counts
    # (the next snippet is the one that switches to "identity" + alpha 0.4)
    geom_histogram(binwidth = 1, position = "fill")
## Warning: Removed 16 rows containing missing values (geom_bar).

ggplot(mtcars, aes(mpg, fill = fam)) +
    # Change the position to identity, with transparency 0.4
    geom_histogram(binwidth = 1, position = "identity", alpha=0.4)

# Plot fcyl, filled by fam
ggplot(mtcars, aes(x=fcyl, fill=fam)) +
    # Add a bar layer
    geom_bar()

ggplot(mtcars, aes(x=fcyl, fill = fam)) +
    # Set the position to "fill"
    geom_bar(position="fill")

ggplot(mtcars, aes(fcyl, fill = fam)) +
    # Change the position to "dodge"
    geom_bar(position = "dodge")

ggplot(mtcars, aes(cyl, fill = fam)) +
    # Change position to use the functional form, with width 0.2
    geom_bar(position = position_dodge(width=0.2))

ggplot(mtcars, aes(cyl, fill = fam)) +
    # Set the transparency to 0.6
    geom_bar(position = position_dodge(width = 0.2), alpha=0.6)

# Plot education, filled by vocabulary
ggplot(Vocab, aes(x=education, fill = factor(vocabulary))) +
    # Add a bar layer with position "fill"
    geom_bar(position="fill")

# Plot education, filled by vocabulary
ggplot(Vocab, aes(education, fill = factor(vocabulary))) +
    # Add a bar layer with position "fill"
    geom_bar(position = "fill") +
    # Add a brewer fill scale with default palette
    scale_fill_brewer()
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors

data(economics)

# Print the head of economics
head(economics)
## # A tibble: 6 x 6
##   date         pce    pop psavert uempmed unemploy
##   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
## 1 1967-07-01  507. 198712    12.6     4.5     2944
## 2 1967-08-01  510. 198911    12.6     4.7     2945
## 3 1967-09-01  516. 199113    11.9     4.6     2958
## 4 1967-10-01  512. 199311    12.9     4.9     3143
## 5 1967-11-01  517. 199498    12.8     4.7     3066
## 6 1967-12-01  525. 199657    11.8     4.8     3018
# Using economics, plot unemploy vs. date
ggplot(economics, aes(x=date, y=unemploy)) +
    # Make it a line plot
    geom_line()

# Change the y-axis to the proportion of the population that is unemployed
ggplot(economics, aes(x=date, y=unemploy/pop)) +
    geom_line()

load("./RInputFiles/fish.RData")

# Plot the Rainbow Salmon time series
ggplot(fish.species, aes(x = Year, y = Rainbow)) +
    geom_line()

# Plot the Pink Salmon time series
ggplot(fish.species, aes(x = Year, y = Pink)) +
    geom_line()

# Plot multiple time-series by grouping by species
ggplot(fish.tidy, aes(Year, Capture)) +
    geom_line(aes(group = Species))

# Plot multiple time-series by coloring by species
ggplot(fish.tidy, aes(x = Year, y = Capture, color = Species)) +
    geom_line()


Chapter 4 - Themes

Themes from Scratch:

  • Themes are all the non-data ink in a chart, and include text, line, rectangle
    • element_text(), element_line(), element_rect()
  • Can adjust theme elements using calls to theme() in ggplot2
    • myPlot + theme(axis.title=element_text(color=“blue”))
  • Hierarchical naming reflects inheritance rules, so making a change to a grandparent will carry down to its children and grandchildren
  • Can blank out everything using element_blank()
    • myPlot + theme(line=element_blank(), rect=element_blank(), text=element_blank())

Theme Flexibility:

  • Theme objects can be a valuable way to enforce consistency across plots
    • myTheme <- theme(…)
    • myPlot + myTheme # will add myTheme as the last argument of myPlot
    • myPlot + myTheme + theme(…) # anything in theme(…) will override anything in myTheme
  • Can also use a number of built-in themes
    • theme_classic()
    • ggthemes::…
    • theme_tufte()
  • Can take an existing theme and update it to a new theme
    • myNewTheme <- theme_update(…)
    • theme_set(myNewTheme) # will apply the theme across all plots

Effective Explanatory Plots:

  • Often, plots are designed for a lay audience, and are intended to convey a clear and/or dramatic message
    • Intuitive color palettes
    • Redundancy in use of color and scales and text labels
    • Moving x/y labels, legends, titles, etc.
    • Adding and labeling global averages

Example code includes:

# Build a small data frame of seven US recession periods (begin/end dates,
# a label for each event, and a y position for placing that label), then
# construct the base unemployment-proportion plot with shaded recession
# rectangles and event labels. This plot object is reused by all the
# theme() examples that follow.
recess <- data.frame(begin=as.Date(c('1969-12-01', '1973-11-01', '1980-01-01', '1981-07-01', '1990-07-01', '2001-03-01', '2007-12-01')), 
                     end=as.Date(c('1970-11-01', '1975-03-01', '1980-07-01', '1982-11-01', '1991-03-01', '2001-11-01', '2009-07-30')), 
                     event=c('Fiscal & Monetary\ntightening', '1973 Oil crisis', 'Double dip I', 'Double dip II', 'Oil price shock', 'Dot-com bubble', 'Sub-prime\nmortgage crisis'), 
                     y=c(0.01416, 0.02067, 0.02951, 0.03419, 0.02767, 0.0216, 0.02521)
                     )
recess
##        begin        end                         event       y
## 1 1969-12-01 1970-11-01 Fiscal & Monetary\ntightening 0.01416
## 2 1973-11-01 1975-03-01               1973 Oil crisis 0.02067
## 3 1980-01-01 1980-07-01                  Double dip I 0.02951
## 4 1981-07-01 1982-11-01                 Double dip II 0.03419
## 5 1990-07-01 1991-03-01               Oil price shock 0.02767
## 6 2001-03-01 2001-11-01                Dot-com bubble 0.02160
## 7 2007-12-01 2009-07-30    Sub-prime\nmortgage crisis 0.02521
# Keep only the recession start dates and label positions
events <- recess %>%
    select(begin, y) %>%
    rename(date=begin)
events
##         date       y
## 1 1969-12-01 0.01416
## 2 1973-11-01 0.02067
## 3 1980-01-01 0.02951
## 4 1981-07-01 0.03419
## 5 1990-07-01 0.02767
## 6 2001-03-01 0.02160
## 7 2007-12-01 0.02521
# Change the y-axis to the proportion of the population that is unemployed
# NOTE(review): fill="red" is placed inside aes(), so it is mapped (creating
# a legend entry) rather than set directly -- the theme() examples below
# spend several calls repositioning/removing that legend.
plt_prop_unemployed_over_time <- ggplot(economics, aes(x=date, y=unemploy/pop)) +
    geom_line(lwd=1.25) + 
    labs(title="The percentage of unemployed Americans\nincreases sharply during recessions") + 
    geom_rect(data=recess, aes(xmin=begin, xmax=end, ymin=0.01, ymax=0.055, fill="red"),
              inherit.aes=FALSE, alpha=0.25
              ) +
    geom_label(data=recess, aes(x=begin, y=y, label=event))



# Iteratively restyle the recession plot with theme(): legend placement,
# background rectangles, axis ticks, panel grid, and text elements.
# View the default plot
plt_prop_unemployed_over_time 

# Remove legend entirely
plt_prop_unemployed_over_time +
    theme(legend.position="none")

# Position the legend at the bottom of the plot
plt_prop_unemployed_over_time +
    theme(legend.position="bottom")

# Position the legend inside the plot at (0.6, 0.1)
# (coordinates are fractions of the plot panel, not data units)
plt_prop_unemployed_over_time +
    theme(legend.position=c(0.6, 0.1))

plt_prop_unemployed_over_time +
    theme(
        # For all rectangles, set the fill color to grey92
        # (rect is the parent element, so the change cascades to children)
        rect = element_rect(fill = "grey92"),
        # For the legend key, turn off the outline
        legend.key = element_rect(color=NA)
  )

plt_prop_unemployed_over_time +
    theme(
        rect = element_rect(fill = "grey92"),
        legend.key = element_rect(color = NA),
        # Turn off axis ticks
        axis.ticks = element_blank(),
        # Turn off the panel grid
        panel.grid = element_blank()
  )

plt_prop_unemployed_over_time +
    theme(
        rect = element_rect(fill = "grey92"),
        legend.key = element_rect(color = NA),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        # Add major y-axis panel grid lines back
        panel.grid.major.y = element_line(
          # Set the color to white
          color="white",
          # Set the size to 0.5
          size=0.5,
          # Set the line type to dotted
          linetype="dotted"
        ), 
        # Set the axis text color to grey25
        axis.text = element_text(color="grey25"),
        # Set the plot title font face to italic and font size to 16
        plot.title = element_text(size=16, face="italic")
  )

# Demonstrate theme spacing controls (unit() for single values, margin()
# for four-sided borders) on an mtcars scatter plot.
# NOTE(review): fcyl (presumably a factor version of cyl) is created in an
# earlier chunk of this document -- confirm it exists before running.
plt_mpg_vs_wt_by_cyl <- ggplot(mtcars, aes(x=wt, y=mpg, color=fcyl)) + 
    geom_point() + 
    labs(x="Weight (1000s of lbs)", y="Miles per Gallon")

# View the original plot
plt_mpg_vs_wt_by_cyl

plt_mpg_vs_wt_by_cyl +
    theme(
        # Set the axis tick length to 2 lines
        axis.ticks.length=unit(2, "lines")
    )

plt_mpg_vs_wt_by_cyl +
    theme(
        # Set the legend key size to 3 centimeters
        legend.key.size = unit(3, "cm")
    )

plt_mpg_vs_wt_by_cyl +
  theme(
    # Set the legend margin to (20, 30, 40, 50) points
    legend.margin = margin(20, 30, 40, 50, "pt")
  )

plt_mpg_vs_wt_by_cyl +
    theme(
        # Set the plot margin to (10, 30, 50, 70) millimeters
        plot.margin=margin(10, 30, 50, 70, "mm")
    )

# Whitespace means all the non-visible margins and spacing in the plot.
# To set a single whitespace value, use unit(x, unit), where x is the amount and unit is the unit of measure.
# Borders require you to set 4 positions, so use margin(top, right, bottom, left, unit)
# To remember the margin order, think TRouBLe
# The default unit is "pt" (points), which scales well with text
# Other options include "cm", "in" (inches) and "lines" (of text)


# Compare built-in and ggthemes themes, then compose a reusable custom
# theme object and install it as the session default with theme_set().
# Add a black and white theme
plt_prop_unemployed_over_time +
    theme_bw()

# Add a classic theme
plt_prop_unemployed_over_time +
    theme_classic()

# Add a void theme
plt_prop_unemployed_over_time +
    theme_void()

# theme_gray() is the default.
# theme_bw() is useful when you use transparency.
# theme_classic() is more traditional.
# theme_void() removes everything but the data.


# Use the fivethirtyeight theme
plt_prop_unemployed_over_time +
    ggthemes::theme_fivethirtyeight()

# Use Tufte's theme
plt_prop_unemployed_over_time +
    ggthemes::theme_tufte()

# Use the Wall Street Journal theme
plt_prop_unemployed_over_time +
    ggthemes::theme_wsj()

# Bundle the earlier element tweaks into one reusable theme object
theme_recession <- theme(
  rect = element_rect(fill = "grey92"),
  legend.key = element_rect(color = NA),
  axis.ticks = element_blank(),
  panel.grid = element_blank(),
  panel.grid.major.y = element_line(color = "white", size = 0.5, linetype = "dotted"),
  axis.text = element_text(color = "grey25"),
  plot.title = element_text(face = "italic", size = 16),
  legend.position = c(0.6, 0.1)
)
# Layer the custom tweaks on top of Tufte's theme
theme_tufte_recession <- ggthemes::theme_tufte() + theme_recession

# Save the current default theme so it can be restored later
themeOld <- theme_get()
theme_set(themeOld)

# Set theme_tufte_recession as the default theme
theme_set(theme_tufte_recession)


# Apply Tufte's theme explicitly, rely on the default set above, and layer
# individual element overrides on top; finally restore the saved default.
plt_prop_unemployed_over_time +
    # Add Tufte's theme
    ggthemes::theme_tufte()

# Draw the plot (without explicitly adding a theme)
# (picks up theme_tufte_recession, installed via theme_set() above)
plt_prop_unemployed_over_time

plt_prop_unemployed_over_time +
    ggthemes::theme_tufte() +
    # Add individual theme elements
    theme(
        # Turn off the legend
        legend.position = "none",
        # Turn off the axis ticks
        axis.ticks = element_blank()
    )

plt_prop_unemployed_over_time +
    ggthemes::theme_tufte() +
    theme(
        legend.position = "none",
        axis.ticks = element_blank(),
        axis.title = element_text(color = "grey60"),
        axis.text = element_text(color = "grey60"),
        # Set the panel gridlines major y values
        panel.grid.major.y = element_line(
            # Set the color to grey60
            color="grey60",
            # Set the size to 0.25
            size=0.25,
            # Set the linetype to dotted
            linetype="dotted"
        )
    )

# Restore the original default theme
theme_set(themeOld)


# Build an explanatory "lollipop" plot of the 10 highest and 10 lowest 2007
# life expectancies from gapminder, then annotate it with the global mean.
data(gapminder, package="gapminder")

# The 10 lowest- and 10 highest-lifeExp countries in 2007
ctry <- c('Swaziland', 'Mozambique', 'Zambia', 'Sierra Leone', 'Lesotho', 'Angola', 'Zimbabwe', 'Afghanistan', 'Central African Republic', 'Liberia', 'Canada', 'France', 'Israel', 'Sweden', 'Spain', 'Australia', 'Switzerland', 'Iceland', 'Hong Kong, China', 'Japan')

gm2007 <- gapminder %>%
    filter(year==2007, country %in% ctry) %>%
    select(country, lifeExp, continent) %>%
    arrange(lifeExp)
gm2007
## # A tibble: 20 x 3
##    country                  lifeExp continent
##    <fct>                      <dbl> <fct>    
##  1 Swaziland                   39.6 Africa   
##  2 Mozambique                  42.1 Africa   
##  3 Zambia                      42.4 Africa   
##  4 Sierra Leone                42.6 Africa   
##  5 Lesotho                     42.6 Africa   
##  6 Angola                      42.7 Africa   
##  7 Zimbabwe                    43.5 Africa   
##  8 Afghanistan                 43.8 Asia     
##  9 Central African Republic    44.7 Africa   
## 10 Liberia                     45.7 Africa   
## 11 Canada                      80.7 Americas 
## 12 France                      80.7 Europe   
## 13 Israel                      80.7 Asia     
## 14 Sweden                      80.9 Europe   
## 15 Spain                       80.9 Europe   
## 16 Australia                   81.2 Oceania  
## 17 Switzerland                 81.7 Europe   
## 18 Iceland                     81.8 Europe   
## 19 Hong Kong, China            82.2 Asia     
## 20 Japan                       82.6 Asia
# Set the color scale
# (keep only the two extreme RdYlBu colors for a diverging red/blue gradient)
palette <- RColorBrewer::brewer.pal(5, "RdYlBu")[-(2:4)]

# Add a title and caption
plt_country_vs_lifeExp <- ggplot(gm2007, aes(x = lifeExp, y = fct_reorder(country, lifeExp), color = lifeExp)) +
    geom_point(size = 4) +
    geom_segment(aes(xend = 30, yend = country), size = 2) +
    geom_text(aes(label = round(lifeExp,1)), color = "white", size = 1.5) +
    scale_x_continuous("", expand = c(0,0), limits = c(30,90), position = "top") +
    scale_color_gradientn(colors = palette) +
    labs(title="Highest and lowest life expectancies, 2007", caption="Source: gapminder")
plt_country_vs_lifeExp

# Define the theme
plt_country_vs_lifeExp +
    theme_classic() +
    theme(axis.line.y = element_blank(), axis.ticks.y = element_blank(), axis.text = element_text(color="black"), axis.title = element_blank(), legend.position = "none")

# Coordinates for the "global average" annotation and its arrow
global_mean <- gapminder %>% filter(year==2007) %>% pull(lifeExp) %>% mean()
x_start <- global_mean + 4
y_start <- 5.5
x_end <- global_mean
y_end <- 7.5


# Add text
plt_country_vs_lifeExp +
    theme_classic() +
    theme(axis.line.y = element_blank(), axis.ticks.y = element_blank(), axis.text = element_text(color="black"), axis.title = element_blank(), legend.position = "none") +
    geom_vline(xintercept = global_mean, color = "grey40", linetype = 3) +
    annotate("text", x = x_start, y = y_start, label = "The\nglobal\naverage", vjust = 1, size = 3, color = "grey40")

# Add a curve
plt_country_vs_lifeExp +  
    theme_classic() +
    theme(axis.line.y = element_blank(), axis.ticks.y = element_blank(), axis.text = element_text(color="black"), axis.title = element_blank(), legend.position = "none") +
    geom_vline(xintercept = global_mean, color = "grey40", linetype = 3) +
    annotate("text", x = x_start, y = y_start, label = "The\nglobal\naverage", vjust = 1, size = 3, color = "grey40") + 
    annotate("curve", x = x_start, y = y_start, xend = x_end, yend = y_end, arrow = arrow(length = unit(0.2, "cm"), type = "closed"), color = "grey40")

# Restore the original default theme
theme_set(themeOld)

Preparing for Machine Learning Interview Questions in R

Chapter 1 - Data Pre-processing and Visualization

Data Normalization:

  • Data preprocessing includes normalization, visualization, and detecting/managing outliers
  • Data normalization (feature scaling) is an important step in the ML pipeline
    • Not always needed, but frequently valuable (and rarely causes harm)
    • Min-max scaling scales every feature to a (0, 1) scale where min is 0 and max is 1
    • Standardization (z-score normalization) scales features to N(0, 1) by using mu, sd

Handling Missing Data:

  • What to do with missing values is an important topic - general options include ignore/delete (often suboptimal), impute, accept (use methods that handle naturally - uncommon)
  • The naniar package is valuable for analyzing the amount of missing data in a frame
    • miss_var_summary(myDF)
    • any_na(myDF)
    • myDF <- replace_with_na_all(myDF, ~.x=="?") # replace all '?' with NA
    • vis_miss(myDF, cluster=TRUE) # visualize missing values, clustered by columns with similar missingness
    • gg_miss_case(myDF) # show the missingness by row
    • add_label_shadow(myDF) # create columns with _NA suffixes reporting the boolean for whether the data are missing
  • Three types of missing data - MCAR, MAR, MNAR
    • MCAR - can impute, will not bias findings, random patterns of missingness in meaningless clusters
    • MAR - can impute, may bias findings, well-defined missingness clusters at least on some dimensions
    • MNAR - should not impute
  • May want to assess the quality of a missing value imputation process
    • How do the variable distributions change (ideally, not much)?
  • Can use the simputation library for imputation
    • imp_lm <- … %>% simputation::impute_lm(Y ~ X1 + X2)

Detecting Anomalies in Data:

  • Outlier detection for univariate data has two common heuristics - 3-sigma, or 1.5*IQR (below Q1 - 1.5*IQR or above Q3 + 1.5*IQR)
  • Outlier detection for multivariate data has four common heuristics - two are kNN and LOF (local outlier factor)
    • Assumptions are the outliers lie far from their neighbors
    • The kNN score is the average distance to the k-nearest neighbors, while LOF (local outlier factor) compares a point's local density to the density of its neighbors
    • FNN::get.knn(myDF)
    • LOF ~ 1 means similar density as neighbors, LOF < 1 means higher density than neighbors (inlier), LOF > 1 means lower density than neighbors (outlier)
  • There are several approaches for managing outliers
    • Retention - use algorithms that are robust to outliers
    • Imputation - impute to some form of less extreme data (mode imputation, kNN imputation, linear imputation, etc.)
    • Capping - replace with 5th percentile and/or 95th percentile
    • Exclusion - generally not recommended, especially in small datasets or datasets that are non-normal

Example code includes:

# Load the datasets for the ML-interview chapter: the Google Play Store apps
# data, the car fuel-consumption data, and a small FIFA sample (built inline
# below). The input files carry an .xls extension but parse with read_csv,
# so they are plain comma-delimited text.
# fifa_sample <- read_csv("./RInputFiles/fifa_sample.xls")
# glimpse(fifa_sample)

apps <- read_csv("./RInputFiles/googleplaystore.xls")
## Parsed with column specification:
## cols(
##   App = col_character(),
##   Category = col_character(),
##   Rating = col_double(),
##   Reviews = col_double(),
##   Size = col_character(),
##   Installs = col_character(),
##   Type = col_character(),
##   Price = col_character(),
##   `Content Rating` = col_character(),
##   Genres = col_character(),
##   `Last Updated` = col_character(),
##   `Current Ver` = col_character(),
##   `Android Ver` = col_character()
## )
## Warning: 2 parsing failures.
##   row     col               expected     actual                                file
## 10473 Reviews no trailing characters M          './RInputFiles/googleplaystore.xls'
## 10473 NA      13 columns             12 columns './RInputFiles/googleplaystore.xls'
# Drop the malformed row flagged by the parsing failures above
apps <- apps[-10473, ]
glimpse(apps)
## Observations: 10,840
## Variables: 13
## $ App              <chr> "Photo Editor & Candy Camera & Grid & ScrapBook", ...
## $ Category         <chr> "ART_AND_DESIGN", "ART_AND_DESIGN", "ART_AND_DESIG...
## $ Rating           <dbl> 4.1, 3.9, 4.7, 4.5, 4.3, 4.4, 3.8, 4.1, 4.4, 4.7, ...
## $ Reviews          <dbl> 159, 967, 87510, 215644, 967, 167, 178, 36815, 137...
## $ Size             <chr> "19M", "14M", "8.7M", "25M", "2.8M", "5.6M", "19M"...
## $ Installs         <chr> "10,000+", "500,000+", "5,000,000+", "50,000,000+"...
## $ Type             <chr> "Free", "Free", "Free", "Free", "Free", "Free", "F...
## $ Price            <chr> "0", "0", "0", "0", "0", "0", "0", "0", "0", "0", ...
## $ `Content Rating` <chr> "Everyone", "Everyone", "Everyone", "Teen", "Every...
## $ Genres           <chr> "Art & Design", "Art & Design;Pretend Play", "Art ...
## $ `Last Updated`   <chr> "January 7, 2018", "January 15, 2018", "August 1, ...
## $ `Current Ver`    <chr> "1.0.0", "2.0.0", "1.2.4", "Varies with device", "...
## $ `Android Ver`    <chr> "4.0.3 and up", "4.0.3 and up", "4.0.3 and up", "4...
cars <- read_csv("./RInputFiles/car-fuel-consumption-1.xls")
## Parsed with column specification:
## cols(
##   distance = col_number(),
##   consume = col_number(),
##   speed = col_double(),
##   temp_inside = col_number(),
##   temp_outside = col_double(),
##   specials = col_character(),
##   gas_type = col_character(),
##   AC = col_double(),
##   rain = col_double(),
##   sun = col_double(),
##   `refill liters` = col_number(),
##   `refill gas` = col_character()
## )
glimpse(cars)
## Observations: 388
## Variables: 12
## $ distance        <dbl> 28, 12, 112, 129, 185, 83, 78, 123, 49, 119, 124, 1...
## $ consume         <dbl> 5, 42, 55, 39, 45, 64, 44, 5, 64, 53, 56, 46, 59, 5...
## $ speed           <dbl> 26, 30, 38, 36, 46, 50, 43, 40, 26, 30, 42, 38, 59,...
## $ temp_inside     <dbl> 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 2...
## $ temp_outside    <dbl> 12, 13, 15, 14, 15, 10, 11, 6, 4, 9, 4, 0, 10, 12, ...
## $ specials        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ gas_type        <chr> "E10", "E10", "E10", "E10", "E10", "E10", "E10", "E...
## $ AC              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ rain            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ sun             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ `refill liters` <dbl> 45, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ `refill gas`    <chr> "E10", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
# Build the 100-row FIFA sample inline: SP = shot power, RA = release amount
fifa_sample <- tibble::tibble(SP=c(43, 70, 47, 22, 74, 45, 65, 71, 66, 62, 58, 55, 57, 15, 67, 66, 46, 65, 71, 80, 68, 62, 49, 70, 55, 17, 56, 48, 25, 62, 14, 55, 17, 43, 62, 63, 52, 62, 58, 81, 73, 59, 60, 66, 43, 59, 58, 79, 16, 64, 14, 12, 68, 78, 36, 52, 59, 67, 75, 80, 38, 73, 56, 80, 66, 68, 72, 41, 72, 51, 66, 37, 75, 19, 15, 34, 69, 86, 74, 57, 80, 51, 76, 63, 22, 76, 43, 22, 46, 39, 55, 81, 77, 62, 81, 19, 70, 74, 60, 59), 
                              RA=c(190, 12, 353, 669, 2.5, 2.6, 406, 18.6, 5.1, 653, 900, 450, 3.9, 713, 1.9, 4.8, 140, 1.6, 1.3, 38.1, 1.6, 953, 891, 2.2, 357, 149, 3.4, 1.7, 7, 347, 105, 2.4, 1.9, 73, 4.8, 801, 3.8, 1.2, 9.5, 6.8, 2.5, 656, 1.5, 7, 631, 1.9, 125, 6.4, 2.6, 648, 1.3, 1.3, 6.7, 20.8, 3.6, 305, 1, 357, 7.5, 17.1, 140, 1.4, 3, 10.3, 795, 6.5, 2.6, 530, 2.7, 495, 12.8, 850, 1.2, 436, 639, 945, 619, 164, 10.2, 639, 5, 365, 1.2, 350, 63, 11.7, 8.7, 534, 2.5, 413, 225, 15.2, 1.6, 534, 14.7, 119, 6.9, 20, 1.5, 512)
                              )


# Compare the raw FIFA features against min-max-scaled and z-score-scaled
# versions, checking ranges before and after and visualizing with boxplots.
# Glimpse at the dataset
glimpse(fifa_sample)
## Observations: 100
## Variables: 2
## $ SP <dbl> 43, 70, 47, 22, 74, 45, 65, 71, 66, 62, 58, 55, 57, 15, 67, 66, ...
## $ RA <dbl> 190.0, 12.0, 353.0, 669.0, 2.5, 2.6, 406.0, 18.6, 5.1, 653.0, 90...
# Compute the scale of every feature
(fifa_scales <- sapply(fifa_sample, range))
##      SP  RA
## [1,] 12   1
## [2,] 86 953
# Plot fifa_sample data
ggplot(fifa_sample, aes(x=SP, y=RA)) + 
    geom_point(colour="blue", size=5) + 
    labs(title = "Original data", x="Shot power", y="Release amount (millions EUR)") +
    theme(plot.title = element_text(size=22), text = element_text(size=18)) +
    scale_x_continuous(breaks = round(seq(0, max(fifa_sample$SP), by = 5),1)) 

# Apply max-min and standardization: fifa_normalized
# Min-max maps each feature onto [0, 1]; z-score centers at 0 with sd 1
fifa_normalized <- fifa_sample %>% 
    mutate(SP_MaxMin = (SP-min(SP))/(max(SP)-min(SP)), RA_MaxMin = (RA-min(RA))/(max(RA)-min(RA)), 
           SP_ZScore = (SP - mean(SP)) / sd(SP), RA_ZScore = (RA - mean(RA)) / sd(RA)
           )

# Compute the scale of every feature: fifa_normalized_scales
(fifa_normalized_scales <- sapply(fifa_normalized, range))
##      SP  RA SP_MaxMin RA_MaxMin SP_ZScore  RA_ZScore
## [1,] 12   1         0         0 -2.265794 -0.7142706
## [2,] 86 953         1         1  1.556152  2.6029590
# Boxplot of original and normalized distributions
boxplot(fifa_normalized[, c("SP", "RA")], main = 'Original')

boxplot(fifa_normalized[, c("SP_MaxMin", "RA_MaxMin")], main = 'Max-Min')

boxplot(fifa_normalized[, c("SP_ZScore", "RA_ZScore")], main = 'Z-Score')

# Build the printing-bands dataset (all columns read in as character, with
# '?' standing in for missing values), then analyze its missingness with
# naniar: '?' is only detected as missing after replace_with_na_all().
bands <- tibble::tibble(Blade_pressure=c('20', '20', '30', '30', '30', '28', '30', '28', '60', '32', '30', '40', '30', '25', '20', '?', '?', '?', '?', '?', '30', '30', '25', '30', '25', '20', '30', '25', '30', '35', '28', '30', '22', '20', '35', '?', '30', '28', '31', '34', '32', '?', '30', '30', '24', '20', '35', '25', '25', '34', '16', '20', '28', '25', '30', '35', '46', '50', '25', '30'), 
                        Roughness=c('0.75', '0.75', '?', '0.312', '0.75', '0.438', '0.75', '0.75', '0.75', '1.0', '0.75', '0.75', '1.0', '0.625', '1.0', '1.0', '?', '?', '0.75', '0.75', '0.812', '0.812', '0.812', '1.0', '1.0', '1.0', '1.0', '1.0', '0.75', '0.75', '0.75', '0.75', '0.625', '0.625', '0.75', '0.875', '0.625', '1.0', '1.0', '0.75', '1.0', '0.875', '0.875', '0.812', '0.75', '0.75', '0.812', '0.625', '0.625', '0.5', '0.75', '0.75', '0.75', '0.875', '0.625', '?', '0.75', '0.75', '0.625', '0.875'), 
                        Ink_pct=c('50.5', '54.9', '53.8', '55.6', '57.5', '53.8', '62.5', '62.5', '60.2', '45.5', '48.5', '52.6', '50.0', '59.5', '49.5', '62.5', '62.5', '58.8', '54.9', '56.2', '58.8', '62.5', '58.1', '62.5', '57.5', '57.5', '57.5', '58.8', '58.8', '58.8', '45.0', '43.5', '54.3', '53.2', '58.8', '63.0', '58.1', '58.8', '54.3', '62.5', '58.1', '61.7', '55.6', '55.6', '58.1', '56.2', '58.8', '57.5', '58.8', '61.0', '50.5', '50.5', '58.8', '58.8', '62.5', '55.6', '58.8', '62.5', '52.6', '54.9'), 
                        Ink_temperature=c('17.0', '15.0', '16.0', '16.0', '17.0', '16.8', '16.5', '16.5', '12.0', '16.0', '16.0', '14.0', '15.0', '14.5', '16.0', '15.0', '14.0', '15.5', '16.4', '16.5', '16.0', '15.0', '16.3', '15.8', '14.5', '14.0', '15.0', '15.2', '15.0', '17.0', '16.0', '16.5', '14.1', '14.0', '17.0', '15.4', '15.0', '16.0', '15.0', '15.0', '16.0', '15.4', '16.0', '16.3', '15.8', '16.6', '17.0', '13.0', '14.0', '15.9', '17.0', '16.5', '15.0', '16.5', '18.0', '17.0', '12.0', '16.0', '14.6', '24.5'),
                        Band_type=c('band', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'band', 'noband', 'noband', 'band', 'band', 'band', 'noband', 'band', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'band', 'band', 'band', 'noband', 'noband', 'band', 'band', 'noband', 'noband', 'noband', 'noband', 'band', 'noband', 'band', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'band', 'band', 'noband', 'noband', 'noband', 'noband', 'noband', 'noband', 'band', 'band')
                        )
str(bands)
## Classes 'tbl_df', 'tbl' and 'data.frame':    60 obs. of  5 variables:
##  $ Blade_pressure : chr  "20" "20" "30" "30" ...
##  $ Roughness      : chr  "0.75" "0.75" "?" "0.312" ...
##  $ Ink_pct        : chr  "50.5" "54.9" "53.8" "55.6" ...
##  $ Ink_temperature: chr  "17.0" "15.0" "16.0" "16.0" ...
##  $ Band_type      : chr  "band" "noband" "noband" "noband" ...
# Check for missing values using base R and naniar functions
any(is.na(bands))
## [1] FALSE
naniar::any_na(bands)
## [1] FALSE
# What? No missing values! Take a closer glimpse
glimpse(bands)
## Observations: 60
## Variables: 5
## $ Blade_pressure  <chr> "20", "20", "30", "30", "30", "28", "30", "28", "60...
## $ Roughness       <chr> "0.75", "0.75", "?", "0.312", "0.75", "0.438", "0.7...
## $ Ink_pct         <chr> "50.5", "54.9", "53.8", "55.6", "57.5", "53.8", "62...
## $ Ink_temperature <chr> "17.0", "15.0", "16.0", "16.0", "17.0", "16.8", "16...
## $ Band_type       <chr> "band", "noband", "noband", "noband", "noband", "no...
# Replace ? with NAs: bands
bands <- naniar::replace_with_na_all(bands, ~.x == '?')

# Compute missingness summaries
naniar::miss_var_summary(bands)
## # A tibble: 5 x 3
##   variable        n_miss pct_miss
##   <chr>            <int>    <dbl>
## 1 Blade_pressure       7    11.7 
## 2 Roughness            4     6.67
## 3 Ink_pct              0     0   
## 4 Ink_temperature      0     0   
## 5 Band_type            0     0
# Visualize overall missingness
naniar::vis_miss(bands)

# Visualize overall missingness, clustered
naniar::vis_miss(bands, cluster = TRUE)

# Visualize missingness in each variable
naniar::gg_miss_var(bands)

# Missingness in variables, faceted by Band_type
naniar::gg_miss_var(bands, facet = Band_type)
## Warning: `cols` is now required.
## Please use `cols = c(data)`

# Visualize missingness in cases
naniar::gg_miss_case(bands)

# Impute the missing bands values two ways (column mean vs. linear model),
# tracking what was imputed via shadow columns and _NA labels.
# NOTE(review): the bands columns are still character at this point (see the
# glimpse above), which is why impute_mean_all() warns "argument is not
# numeric or logical" -- confirm whether the columns should be converted to
# numeric before imputation.
# Impute with the mean
imp_mean <- bands %>%
    naniar::bind_shadow(only_miss = TRUE) %>%
    naniar::add_label_shadow() %>%
    naniar::impute_mean_all()
## Warning in mean.default(x, na.rm = TRUE): argument is not numeric or logical:
## returning NA
## Warning in mean.default(x, na.rm = TRUE): argument is not numeric or logical:
## returning NA

## Warning in mean.default(x, na.rm = TRUE): argument is not numeric or logical:
## returning NA

## Warning in mean.default(x, na.rm = TRUE): argument is not numeric or logical:
## returning NA

## Warning in mean.default(x, na.rm = TRUE): argument is not numeric or logical:
## returning NA

## Warning in mean.default(x, na.rm = TRUE): argument is not numeric or logical:
## returning NA
# Impute with lm
# (each incomplete column is predicted from Ink_temperature)
imp_lm <- bands %>%
    naniar::bind_shadow(only_miss = TRUE) %>%
    naniar::add_label_shadow() %>%
    simputation::impute_lm(Blade_pressure ~ Ink_temperature) %>%
    simputation::impute_lm(Roughness ~ Ink_temperature) %>%
    simputation::impute_lm(Ink_pct ~ Ink_temperature)


# Peek at the first few rows of imp_models_long
# head(imp_models_long)

# Visualize post-imputation distributions
# ggplot(imp_models_long, aes(x = imp_model, y = value)) + 
#     geom_violin(aes(fill=imp_model)) +
#     facet_wrap(~variable, scales='free_y')

# Calculate post-imputation distribution stats
# imp_models_long %>% 
#     group_by(imp_model, variable) %>% 
#     summarize(var = var(value), avg = mean(value), 
#     median = median(value)) %>% 
#     arrange(variable)


# Detect outliers in the cars data three ways: the univariate 1.5*IQR rule
# on consume, then multivariate kNN distance and LOF scores on the scaled
# (distance, consume) pair.
# Peek at the cars dataset
head(cars)
## # A tibble: 6 x 12
##   distance consume speed temp_inside temp_outside specials gas_type    AC  rain
##      <dbl>   <dbl> <dbl>       <dbl>        <dbl> <chr>    <chr>    <dbl> <dbl>
## 1       28       5    26         215           12 <NA>     E10          0     0
## 2       12      42    30         215           13 <NA>     E10          0     0
## 3      112      55    38         215           15 <NA>     E10          0     0
## 4      129      39    36         215           14 <NA>     E10          0     0
## 5      185      45    46         215           15 <NA>     E10          0     0
## 6       83      64    50         215           10 <NA>     E10          0     0
## # ... with 3 more variables: sun <dbl>, `refill liters` <dbl>, `refill
## #   gas` <chr>
# Boxplot of consume variable distribution
boxplot(cars$consume)

# Five-number summary: consume_quantiles
(consume_quantiles <- quantile(cars$consume))
##   0%  25%  50%  75% 100% 
##    4   41   46   52  122
# Calculate upper threshold: upper_th
# (Q3 + 1.5 * IQR, the standard boxplot whisker rule)
upper_th <- consume_quantiles["75%"] + 1.5 * (consume_quantiles["75%"] - consume_quantiles["25%"])

# Print the sorted vector of distinct potential outliers
sort(unique(cars$consume[cars$consume > upper_th]))
##  [1]  69  71  74  79  81  87  99 108 115 122
# Scale data and create scatterplot: cars_scaled
cars_scaled <- cars %>%
    select(distance, consume) %>%
    scale() %>%
    as.data.frame()
plot(distance ~ consume, data = cars_scaled, main = 'Fuel consumption vs. distance')

# Compute KNN score
# (score = mean distance to the 7 nearest neighbors; larger = more isolated)
cars_knn <- FNN::get.knn(data = cars_scaled, k = 7)
cars$knn_score <- rowMeans(cars_knn$nn.dist)

# Print top 5 KNN scores and data point indices: top5_knn
(top5_knn <- order(cars$knn_score, decreasing = TRUE)[1:5])
## [1] 320 107  56  62 190
print(cars$knn_score[top5_knn])
## [1] 4.322676 2.202246 1.927798 1.641515 1.365469
# Plot variables using KNN score as size of points
plot(distance ~ consume, cex = knn_score, data = cars,  pch = 20)

# Scale cars data: cars_scaled
# (the knn_score column is included this time)
cars_scaled <- cars %>%
    select(distance, consume, knn_score) %>%
    scale() %>%
    as.data.frame()

# Add lof_score column to cars
cars$lof_score <- dbscan::lof(cars_scaled, k = 7)

# Print top 5 LOF scores and data point indices: top5_lof
# (LOF > 1 indicates lower density than neighbors, i.e. a likely outlier)
(top5_lof <- order(cars$lof_score, decreasing = TRUE)[1:5])
## [1] 165 287 186 320  80
print(cars$lof_score[top5_lof])
## [1] 4.172873 3.473775 3.352448 3.113629 3.058361
# Plot variables using LOF score as size of points
plot(distance ~ consume, cex = lof_score, data = cars, pch = 20)


Chapter 2 - Supervised Learning

Interpretable Models:

  • Interpretability is the degree to which a human can understand the reason that a model has made a prediction
    • The explanation is the answer to a “why” question
    • Satisfies human learning and curiosity, improves safety and security, enables bias detection (ensures legal/regulatory compliance), increasing trust and social acceptance
  • Several ML algorithms produce interpretable models - linear regression, logistic regression, decision trees, decision rules, naïve Bayes, k-nearest neighbors (kNN)
    • In interpreting a regression, the coefficient should be explained using “all else equal”

Regularization:

  • Regularization attempts to constrain (shrink) the regression coefficients closer to 0, minimizing over-fitting
    • L1 - lasso (least absolute shrinkage and selection operator, penalized by sum of absolute value of coefficients times lambda)
    • L2 - ridge (penalizes by sum-squared of coefficients times lambda - note that lambda=0 is standard regression)
    • Elastic net - mix of L1/L2 (lambda1=0 is ridge, lambda2=0 is lasso, both=0 is OLS)
  • Regularization minimizes over-fits and improves generalization
    • Requires tuning of the lambda coefficient
    • L1 (lasso) drives some/many coefficients to zero, which has a feature-selection component that makes for easier interpretation and communication

Bias and Variance:

  • Bias and variance are two sources of error in machine learning
    • Bias is error from incorrect model assumptions (under-fitting)
    • Variance is error from over-sensitivity to small differences in the training data (over-fitting)
  • To reduce model bias, add terms (complexity) or engage in feature engineering or reduce regularization
  • To reduce variance, add more training data (simplest and most robust solution), add regularization, perform feature selection, decrease model size (fewer terms or fewer neurons or etc.)

Building Ensemble Models:

  • Three common strategies for ensembling are bagging, boosting, and stacking
  • Bagging is bootstrap aggregation - the base learner is trained on each bootstrap, then the learners are combined to make a prediction (e.g., random forest)
  • Boosting constructs base learners in an incremental manner - the better the learner performs, the higher its weight (adaboost, gbm)
    • Training samples also receive weights, with worse predictions leading to a higher weight
    • Subsequent learners help learn from the mistakes of previous learners
  • Stacking finds linear combinations of underlying learners (built on the learners not on the raw data)
  • Need to select base learners for an ensemble
    • Diversity of base learners can be beneficial - different families, different samples of data/attributes
    • Low correlation in predictions of the base learners
    • Base learners that are robust to outliers and missing data
    • Base learners that run in a reasonable amount of time
  • Can use the caretEnsemble library for building ensemble models
    • caretList() will build a list of train() models for ensembling
    • caretEnsemble() ensembles the caretList() using linear regression
    • caretStack() ensembles using a meta-learner
    • The caret::resamples() function calculates performance metrics across all learners
    • modelCor() gives the correlation among the base learners’ predictions

Example code includes:

# Prepare the fuel-consumption data: keep six columns of interest and convert
# the two categorical variables to factors (AC gets explicit "Off"/"On" labels).
car <- cars %>%
    transmute(distance, consume, speed, temp_outside,
              gas_type = factor(gas_type),
              AC = factor(AC, labels = c("Off", "On")))

# A single hand-built observation, with factor levels matching the training
# data, used below to compare predictions across model types.
test_instance <- tibble::tibble(
    distance     = 12.4,
    consume      = 5.1,
    speed        = 45,
    temp_outside = 5,
    gas_type     = factor("E10", levels = c("E10", "SP98")),
    AC           = factor("Off", levels = c("Off", "On"))
)
# Display the hand-built test observation (1 row x 6 columns)
test_instance
## # A tibble: 1 x 6
##   distance consume speed temp_outside gas_type AC   
##      <dbl>   <dbl> <dbl>        <dbl> <fct>    <fct>
## 1     12.4     5.1    45            5 E10      Off
# Glimpse on the car dataset: 388 observations of 6 variables
glimpse(car)
## Observations: 388
## Variables: 6
## $ distance     <dbl> 28, 12, 112, 129, 185, 83, 78, 123, 49, 119, 124, 118,...
## $ consume      <dbl> 5, 42, 55, 39, 45, 64, 44, 5, 64, 53, 56, 46, 59, 51, ...
## $ speed        <dbl> 26, 30, 38, 36, 46, 50, 43, 40, 26, 30, 42, 38, 59, 58...
## $ temp_outside <dbl> 12, 13, 15, 14, 15, 10, 11, 6, 4, 9, 4, 0, 10, 12, 11,...
## $ gas_type     <fct> E10, E10, E10, E10, E10, E10, E10, E10, E10, E10, E10,...
## $ AC           <fct> Off, Off, Off, Off, Off, Off, Off, Off, Off, Off, Off,...
# Build a multivariate regression model: car_lr
# (the formula `consume ~ .` regresses consume on all remaining columns)
car_lr <- lm(consume ~ ., data = car)

# Summarize the model and display its coefficients
# (only temp_outside and speed are significant at the 5% level here)
summary(car_lr)$coef
##                  Estimate  Std. Error     t value     Pr(>|t|)
## (Intercept)  56.883105605 3.484721565 16.32357264 8.066425e-46
## distance      0.007590493 0.004953192  1.53244479 1.262404e-01
## speed        -0.193808735 0.077206153 -2.51027578 1.247618e-02
## temp_outside -0.587521978 0.128868107 -4.55909529 6.927263e-06
## gas_typeSP98  0.443869591 1.818572662  0.24407581 8.073031e-01
## ACOn         -0.098094250 3.344675402 -0.02932848 9.766179e-01
# Predict consume for the single test instance with the linear regression model
predict(car_lr, test_instance)
##        1 
## 45.31822
# Build a regression tree: car_dt
# (rpart fits a CART-style tree on the same formula as the linear model above)
car_dt <- rpart::rpart(consume ~ ., data = car)

# Fancy tree plot (rattle renders the fitted rpart tree graphically)
rattle::fancyRpartPlot(car_dt)

# Extract human-readable decision rules from the tree: one row per leaf,
# showing the predicted consume value and the splits that reach that leaf
rpart.plot::rpart.rules(car_dt)
##  consume                                                                           
##       32 when speed >=       22 & temp_outside >=       12 & distance <   29       
##       36 when speed >=       25 & temp_outside <  12       & distance <   47       
##       38 when speed >=       25 & temp_outside is  4 to 12 & distance >=        167
##       40 when speed >=       22 & temp_outside >=       12 & distance >=         29
##       43 when speed >=       25 & temp_outside <  12       & distance is  52 to 122
##       49 when speed >=       25 & temp_outside <   4       & distance >=        167
##       51 when speed >=       25 & temp_outside <  12       & distance is 122 to 167
##       61 when speed is 22 to 25 & temp_outside <  12                               
##       68 when speed >=       25 & temp_outside <  12       & distance is  47 to  52
##       75 when speed <  22
# Predict test instance with decision tree
# (note the prediction differs from the linear model's 45.3 above)
predict(car_dt, test_instance)
##        1 
## 36.41667
fifaRaw <- c(0.569, -1.555, -0.068, -0.705, -1.342, -0.28, -0.068, -1.98, -0.28, 0.357, -1.767, 0.357, -0.28, -0.918, -0.068, -0.068, 2.693, -0.918, 0.357, -0.068, -0.918, 0.144, -0.493, -1.767, -0.28, 1.419, 0.357, -0.28, 1.844, -0.493, -1.342, -1.342, -0.28, -1.13, 1.207, 2.269, -0.28, 1.419, 0.357, -0.068, -0.918, -0.918, 0.569, -0.28, 0.144, -0.28, -0.705, -0.28, 2.693, -1.342, 0.782, 0.357, -1.555, 0.994, 1.207, 0.994, 0.357, -0.918, -0.28, -0.493, -0.705, 0.144, -0.068, 1.207, -1.13, -0.918, -0.918, -0.068, -1.555, -0.068, 1.207, 0.357, -1.342, 0.569, -0.493, 1.631, -0.918, 0.994, 1.207, -0.068, 0.357, -0.705, 0.782, -0.068, 2.481, 1.631, -0.918, -0.918, -1.767, 1.631, 0.782, -0.28, 0.569, -0.705, 0.144, 2.481, -1.342, 0.782, 0.144, 0.144, -1.342, -0.493, -0.918, -1.555, 0.357, -1.555, -0.493, -0.493, -0.493, 0.569, -1.342, -0.28, 1.207, -1.555, -1.342, -0.068, 1.419, -1.555, 0.569, -0.493, 1.207, 0.144, -0.705, -0.493, 2.056, -1.342, 1.207, -1.13, 0.782, -0.068, -0.493, -1.13, -0.068, -0.493, -0.068, 0.994, -1.13, 0.357, -1.13, -0.068, -1.13, -0.705, -0.068, -0.28, -0.705, 0.782, 1.207, 1.631, 0.994, -1.13, -1.13, 0.994, 0.569, 0.994, 0.782, 0.144, 0.144, 0.144, -1.13, -0.068, -0.705, 0.144, 0.357, 0.782, 0.782, -0.705, -0.28, -0.705, 1.631, 2.056, 1.631, -0.068, -1.13, 0.144, -0.28, 1.207, -0.493, 1.419, -1.555, 0.782, -1.13, -1.13, -1.13, -0.068, 0.782, -0.918, 0.144, 0.569, -0.068, -0.493, 0.994, -0.918, -1.342, -0.493, -0.918, 0.569, -0.28, -0.705, 0.144, -1.342, 0.994, -1.555, 1.207, -0.28, 2.056, -1.13, -0.918, 1.631, 0.357, -0.705, -0.493, 2.056, -0.493, 1.844, 0.782, -0.068, 0.144, -0.705, -0.918, -0.28, 0.782, -1.555, -0.918, 2.056, 0.994, -0.493, -0.493, 0.144, -1.342, -1.342, 0.782, 0.994, 0.144, -0.28, 0.144, -0.493, -0.493, -0.918, 0.782, -1.342, 0.144, 0.144, -0.28, 1.419, -0.068, -0.068, 0.782, -1.767, -0.918, -0.068, -0.493, 0.357, -1.13, -0.28, 0.357, -1.13, -0.28, 0.994, -1.555, 1.207, -0.28, -0.705, -1.13, 0.144, -0.918, -0.705, 0.994, 
0.357, 0.357, -0.705, -0.493, -0.493, 0.357, -0.28, -0.493, 0.569, -1.342, -0.068, 0.569, 2.269, -0.493, 1.419, 1.631, -1.13, 0.144, 2.056, -0.068, 0.994, 1.631, -0.068, 1.419, 2.481, 1.207, -0.28, 2.056, -1.555, -0.918, 0.782, 0.782, -0.493, 1.631, -0.28, -1.342, -0.918, -1.555, -0.068, 0.569, 1.207, -0.068, 0.569, -0.705, -0.28, -1.342, 1.631, 0.357, -0.068, -1.555, 0.357, 0.782, -0.705, -1.13, -0.493, -0.28, -0.918, 0.994, -0.493, -1.342, 0.357, 0.782, -0.918, -0.28, 1.419, 0.144, -1.13, -0.28, -0.28, 0.569, -1.342, -0.918, -1.342, 1.844, -0.068, -0.28, -0.068, 0.144, -0.28, -1.767, -0.28, -0.705, 0.144, -1.13, -1.13, -0.705, 0.569, -0.068, -1.342, 0.357, 0.144, 2.056, -0.705, 1.631, 0.782, -1.342, 1.419, -0.28, -0.28, 0.144, 1.419, 1.631, 0.569, -0.705, 2.056, -1.767, -0.918, -0.28, -1.555, -0.068, -0.068, 0.144, -1.555, -0.493, -0.068, -0.068, -0.705, -0.28, -1.13, -0.068, 1.419, 2.056, -0.493, -0.918, -0.705, -0.918, 1.419, -0.493, -0.28, 0.144, -0.493, -0.918, -1.342, -0.28, 0.357, -1.13, 1.631, 0.782, 0.357, 0.994, 0.782, -0.068, -0.918, -0.28, 1.844, -0.28, 0.782, 0.357, -1.13, -1.13, 0.569, 0.569, -0.068, 0.782, 0.357, 0.782, 0.144, 1.844, 1.207, 0.144, 0.357, 0.357, -1.13, -1.767, -0.068, 2.056, -1.342, 1.631, -0.068, 1.631, -0.068, 0.357, 1.419, 0.782, 0.569, -0.918, 0.569, -0.918, 1.419, -0.28, 0.569, 0.994, 0.357, 0.782, 0.357, 1.207, 0.782, -0.918, -0.493, -0.28, -1.98, 0.994, -1.342, -1.342, 0.357, 0.144, -0.493, -0.068, -1.342, -0.705, -0.918, 0.357, -1.555, 0.357, 1.419, 0.357, -0.068, 1.419, 1.631, 1.419, 1.419, 0.144, -0.493, 0.569, 1.844, 0.569, -1.13, -0.28, 1.631, 1.844, 1.207, -0.705, -1.555, -1.342, -0.705, -0.705, 2.056, 1.419, -0.918, -0.493, 0.994, -0.705, 0.782, -0.198, 1.373, -0.023, -0.198, -0.023, 2.246, -0.198, 0.152, -0.547, -0.547, 0.675, -0.721, -0.896, 0.501, 1.199, -1.943, -0.023, 2.595, -0.372, -2.467, -0.198, -1.245, -0.372, -0.023, -0.023)
fifaRaw <- c(fifaRaw, -1.419, -0.896, -0.372, -0.198, 1.548, -1.245, -0.198, -0.547, 1.024, 1.548, -0.721, -0.721, 0.152, -0.198, 0.501, 1.722, 0.85, -0.547, -0.198, 0.326, 1.024, 0.85, 0.326, -0.198, -0.023, 1.199, -0.721, 1.548, 0.501, -1.07, 0.152, 0.675, -0.198, -1.594, 0.501, -0.198, 0.85, -0.198, -0.198, -0.023, 1.024, -0.023, -1.07, 0.85, -0.547, -1.594, -1.07, 0.675, 3.293, 0.326, -0.023, 1.373, -1.245, -0.372, 1.373, 0.326, 0.501, -0.547, -0.896, 0.675, -0.721, -1.245, -0.547, 0.85, 0.326, 0.501, -0.372, -0.372, 0.675, -0.372, 0.152, 0.501, 0.675, -0.547, -0.896, 0.85, 0.85, 1.199, -0.547, -0.547, 0.501, -1.07, -0.023, 0.326, 0.326, 0.501, 0.501, 1.024, 0.152, 0.675, 1.024, -1.07, -0.198, -1.07, 0.501, -1.245, -0.198, 1.024, 1.548, -0.547, -1.07, 0.326, 1.722, 0.501, -0.721, -0.547, -0.198, -0.547, -0.896, -0.198, -1.245, -0.198, -1.594, -1.768, -0.023, 1.897, 0.152, 2.421, 1.199, -1.07, -0.023, -0.023, 0.85, -0.023, 0.675, -0.198, -0.896, -0.547, -0.023, 0.675, -0.372, -0.896, 0.326, 1.024, 0.85, 0.501, -0.198, -1.245, -0.547, -0.372, 0.85, -0.896, 2.071, -0.547, -0.198, -1.245, 0.326, 0.326, -0.721, -0.198, -1.245, 0.152, -0.896, 0.501, -1.07, -0.372, -2.292, 0.326, 0.501, -1.07, 1.199, 0.152, -1.07, -1.943, -1.245, -0.198, 1.199, -1.768, 0.326, -1.245, -0.721, 0.326, 0.326, 0.85, 1.548, -0.023, -1.245, -1.07, -0.023, -0.547, 0.326, 1.024, -0.198, 0.326, 1.199, 0.85, -0.023, 1.199, -1.419, 1.199, -0.896, -0.023, 0.501, 0.152, -2.118, -0.198, -0.023, -0.023, -1.419, 1.024, 0.85, 0.501, -1.07, 0.85, -0.198, -1.245, -0.721, -0.721, 1.897, 0.326, 0.152, 1.024, -0.198, 0.326, 0.501, -0.198, -1.07, 1.199, -1.245, 0.675, -0.721, -0.547, 0.152, -1.07, -1.245, 0.85, -0.023, 1.373, 1.024, -0.547, -0.372, 1.024, -0.547, -0.372, -2.118, -1.768, -1.07, 1.024, -2.816, 1.024, 0.501, -0.372, -0.372, 1.897, 0.152, -0.721, 2.944, 1.024, 1.199, -0.721, -1.594, -0.896, -0.372, 0.85, -1.245, -0.372, 0.501, -0.547, 2.071, 1.548, -0.547, 1.548, -0.023, 1.024, -0.721, -0.198, 
0.675, 0.85, -1.245, -1.594, 0.85, -1.419, 0.675, 0.675, -0.721, -1.768, 3.293, 0.326, 0.326, 1.024, -0.547, 0.326, 2.246, -0.198, -0.547, 1.373, 0.152, 1.199, 1.024, 0.152, -1.419, -0.372, -0.023, -0.372, -0.372, -0.023, 1.373, 1.024, 0.326, 0.675, 2.246, -0.547, 1.548, -0.372, -0.547, -0.023, -0.198, -1.07, 0.326)
fifaRaw <- c(fifaRaw, 0.675, 0.152, -1.419, 1.373, -0.023, 0.152, -0.547, 1.373, -0.023, 0.501, -0.198, -0.023, 0.501, 0.152, 1.199, -1.07, 2.421, -1.07, 1.024, 0.326, -0.198, -1.245, -1.245, -0.198, -0.372, -2.467, 0.501, 0.152, -0.023, -0.198, 0.326, -0.896, 0.501, 1.897, -0.547, -0.721, 0.501, -0.372, 1.373, 0.675, 0.326, 0.152, -1.07, 0.675, -0.198, -0.198, 0.152, -0.721, -0.198, -0.372, -0.023, -0.896, 0.675, -0.198, 0.675, 1.373, -1.594, 1.373, 0.326, -1.768, 1.722, 1.024, 0.326, 1.548, 1.722, -1.245, -2.292, 0.675, -0.547, -1.07, -0.547, -0.721, -1.419, 0.85, 0.501, 0.501, 0.501, -1.943, 1.199, -0.896, 0.152, 2.071, 0.152, -0.372, -0.721, 2.071, -0.896, -1.245, -1.07, -0.198, -2.292, -1.245, -0.023, -0.547, 0.501, -0.547, 0.85, -0.372, -0.372, -0.547, -0.023, -0.547, 0.675, -0.023, 0.85, 0.326, -0.198, -1.07, 0.152, 0.326, -0.372, -1.419, -1.245, 0.501, -0.372, -1.07, -0.896, -0.198, -0.023, 1.199, 1.373, -0.896, -0.896, 0.675, 0.326, -1.594, -0.372, -0.721, -0.547, 1.722, -0.547, -1.245, 1.722, 0.501, 0.326, 1.548, 0.675, -1.768, -0.372, -1.245, -1.245, -0.721, -1.245, 3.468, 0.675, -0.023, 0.152, -1.768, -0.198, -0.198, -1.594, 0.675, 0.501, -1.245, 0.85, 1.548, -0.023, 1.897, 0.675, -1.419, 0.85, -1.419, 1.199, 1.199, -0.372, -1.594, 0.618, 0.095, 0.409, -1.841, 0.042, -1.789, -0.586, -0.167, 0.775, 0.566, -0.167, 0.775, 0.461, 0.147, 1.351, -1.056, 1.508, 0.827, 0.775, 0.513, -1.475, 0.461, -1.789, -0.115, 0.775, -1.894, 0.88, 0.88, 1.194, 0.461, -0.69, 0.356, 0.461, 0.513, -1.632, 0.618, 0.566, 0.566, -0.586, 0.513, -1.004, 0.461, -1.737, 1.141, 1.194, 0.984, 0.042, 0.409, -1.946, -1.318, 1.455, 0.618, -0.01, -1.841, -0.481, -1.841, 1.351, -1.423, -1.946, 0.723, 0.461, 0.618, 0.513, 0.775, 0.304, 0.147, 0.618, -1.004, 0.827, 0.775, -0.219, 0.618, 0.67, 2.031, 0.042, -2.051, 0.409, 1.351, 0.618, -0.376, -0.324, -1.109, 1.037, -1.475, -1.998, 0.513, -0.272, 0.252, -0.952, 0.095)
fifaRaw <- c(fifaRaw, -0.062, 0.199, 0.775, -0.01, 0.984, 0.409, -0.899, 1.194, 0.67, -0.324, -0.743, -0.115, 0.827, -1.318, 1.089, -1.423, 0.252, 0.566, -0.847, 1.665, -1.894, -1.318, 1.037, -1.998, -0.952, 1.298, 0.67, 0.147, 0.042, -0.272, 0.88, 1.246, -1.946, 1.141, -1.946, -1.318, -0.272, 1.298, 0.042, -1.58, 0.566, -1.109, 0.356, 0.513, 0.095, 0.147, 0.566, -1.527, -0.062, -1.266, 0.67, -0.167, 1.56, 0.67, 0.304, 0.984, 1.194, -1.946, 1.141, 0.199, -0.272, 0.932, 0.409, -0.167, 1.037, 0.827, 0.67, 0.984, -0.01, -1.37, 0.67, 0.566, 0.723, 0.67, -2.051, 0.67, -0.899, -1.841, -1.737, -1.946, 0.566, 0.618, 0.409, -1.737, 0.252, 0.409, 0.095, 0.566, -0.429, 0.252, 0.461, -2.051, 0.252, 0.67, 0.775, -0.586, 0.566, 0.199, -0.01, 0.304, -0.376, -0.167, -1.894, -1.841, -1.056, 1.037, 0.199, 0.513, 0.618, -2.051, -1.894, -0.115, 0.932, 0.984, -0.272, 0.042, -0.062, 1.351, 1.351, 1.612, 0.827, -0.795, 1.089, 0.723, 1.298, 0.566, 1.141, -1.056, 0.304, 0.095, 0.88, -0.69, 0.356, 0.775, 1.403, -0.743, 0.566, 0.67, 0.042, -1.894, 0.513, 0.618, -0.219, 0.461, -1.841, 0.775, 0.827, -2.051, 0.147, 0.409, -1.789, 0.252, 0.827, 0.199, 0.409, 0.356, -0.952, -1.109, -0.586, 0.618, 0.984, -0.115, -0.69, 0.513, -0.899, 0.461, -0.638, 0.932, -1.946, -0.167, 0.304, -0.899, -1.998, 0.461, -0.219, 0.618, 0.304, 1.141, 0.618, 0.984, -1.056, -0.115, 0.409, -2.051, 0.409, 0.304, -0.533, -0.219, -0.376, -1.998, 0.356, 0.513, -2.155, -0.533, 0.042, 1.194, 1.246, 0.932, 0.827, -1.789, 0.775, 0.618, 1.455, -0.219, -1.841, -1.998, 0.199, 0.513, 0.095, 0.461, -1.632, 1.508, 0.461, -1.998, 1.141, -1.37, 1.298, 1.978, 0.723, 0.984, 0.723, 0.042, -0.795, 1.508, -0.324, 0.409, -1.004, 0.984, 1.194, -0.481, 0.775, -0.481, -1.998, 0.932, 0.356, 1.455, -0.533, 0.095, -0.324, -1.423, -0.533, -1.737, 0.461, 0.147, -1.998, 0.775, 0.566, -1.109, -0.01, -1.946, 0.042, -0.952, 0.618, 1.351, -0.69, 0.304, -1.056, -0.167, 1.403, 0.513, 1.298, -0.638, 0.304, 1.298, -1.737, -0.69, 1.037, -1.161, -1.789, -0.899, 
1.403, -0.272, 0.252, 0.356, 0.88, -0.272, 0.827, 0.147, 0.461, -0.795, 0.932, -1.841, -0.69, -1.946, 0.618, 0.199, 0.775, 0.67, 0.775, -0.481, -0.01, -1.004, 0.409, -0.272, 0.199, -0.062, -1.841, 1.717, -1.266, 0.566, 0.304, 0.042, 0.095, -2.208, 0.566, 0.199, 0.827, 0.618, 0.513, -0.272, -1.58, 0.67, 0.304, -0.115, 1.298, 0.618, 0.461, 1.298, -1.737, 0.513, 1.141, 0.67, 0.88, 0.356, -0.69, 0.147, -1.004, 0.827, 0.461, 1.926, 0.461, 0.67, 1.194, -0.69, -1.841, 0.775, 0.775, 0.88, 1.141, 0.042, -1.894, -0.743, -1.998, -0.376, 0.67, -0.743, 0.827, -1.004, 1.194, -0.01, 1.141, 1.037, 0.304, -0.167, -0.847, 0.513, 0.461, 1.194, -1.894, 0.304, 0.042, 0.984, -0.219, -1.946, -1.632, -1.266, 0.775, 0.147, 0.618, -0.638, -1.161, 0.88, 0.461)
fifaRaw <- c(fifaRaw, 0.147, 0.095, -0.272, 0.88, 0.566, -1.998, -0.429, 1.351, 0.304, 0.723, -1.213, -1.475, -1.789, -1.737, 0.88, 0.775, -1.109, 0.199, 0.513, 0.461, 0.618, -0.115, 0.88, 1.351, 0.199, -0.638, -0.69, 0.252, 0.409, 0.618, -2.051, -0.01, -0.481, 0.513, 1.612, 0.67, -1.318, 1.232, 0.475, 1.081, -1.242, -0.636, -1.646, -0.283, 0.576, 0.475, 0.02, -0.485, 1.182, 0.525, 0.273, 1.182, 0.879, 1.131, 1.485, 0.879, -0.182, -1.444, -0.03, -1.646, -1.091, -1.04, -1.444, 0.778, -0.485, 0.273, 0.323, -1.444, -0.586, -0.586, -0.535, -1.697, 1.081, 0.576, -1.242, -0.737, 0.424, -0.434, -0.283, -1.798, 1.03, 0.02, 1.434, 1.333, 0.374, -1.697, -0.081, 1.384, 0.424, 0.576, -1.646, 0.677, -1.495, -0.182, -0.788, -2, 1.131, 0.828, -0.182, 1.03, -0.384, 0.626, 0.828, 0.172, -1.192, 0.172, 1.182, 0.828, 1.03, -0.03, 1.586, 0.02, -1.646, 1.03, 0.576, -0.788, 1.535, -0.737, -1.04, -0.081, 0.929, -1.697, 1.131, -1.444, -1.242, -0.99, -0.081, -0.737, 0.879, -0.485, 0.576, 1.182, 1.535, -0.889, 1.182, -0.737, 1.081, 1.081, 0.778, -0.384, -1.343, -0.737, -1.04, -0.687, 0.626, -0.232, 0.626, -1.848, -0.434, 0.071, -1.848, -1.394, 0.727, 0.475, 0.525, 1.131, 1.283, 0.02, 0.929, -1.293, 0.929, -1.646, -1.04, -0.485, 0.879, -0.434, -1.293, 0.677, -0.939, -0.384, 0.576, 1.182, 0.071, 0.677, -1.697, -0.788, -1.394, 0.98, -0.081, 1.333, 1.081, 0.626, 0.02, 0.778, -1.697, 1.03, 0.374, 0.929)
fifaRaw <- c(fifaRaw, 0.929, -0.838, -0.636, 0.475, 0.98, 0.879, 0.424, 0.778, -1.394, 0.525, 1.434, -0.182, -1.04, -1.444, 0.929, 1.232, -1.495, -1.495, -1.646, 0.424, 1.232, -0.636, -1.747, -0.586, 0.576, 1.182, 0.677, 0.323, -0.131, 0.172, -1.899, 0.929, -0.485, 0.626, 1.232, 0.626, -0.535, -0.384, -0.232, -1.596, 0.98, -2, -1.343, -1.04, 0.475, 1.182, -0.737, 0.727, -2, -1.697, 0.071, 0.626, -0.687, 1.131, -0.485, 0.98, 0.727, 1.283, 1.232, 1.081, -0.485, 0.424, 0.273, 1.687, 0.121, 0.273, -1.192, -0.737, -1.192, 0.677, -0.788, 0.172, -0.283, 0.475, -1.343, 0.071, 0.98, -0.131, -1.798, -0.485, -0.03, 0.929, -0.03, -1.242, 0.677, 1.283, -1.848, -1.091, 0.576, -1.545, 0.374, 1.586, 0.626, 0.424, 0.778, -0.838, 0.727, -0.99, 0.475, 1.081, 0.374, -0.838, 0.323, -0.99, 0.525, -1.091, 0.879, -1.899, 0.374, 0.172, -1.141, -1.848, -0.636, -0.687, -0.636, 1.434, -0.384, 0.02, 0.727, 0.727, -0.384, 1.636, -1.747, 0.677, -0.636, 0.525, 0.677, 1.384, -1.596, -0.384, 1.131, -1.747, 1.434, 1.838, 1.182, -0.636, 0.576, 1.788, -1.848, 0.475, 0.525, 0.778, 0.677, -1.545, -1.899, 0.626, 0.323, 1.03, 0.778, -1.596, 1.788, -0.384, -1.899, 0.02, -1.242, 1.081, 1.687, -1.293, -0.586, 1.232, -0.636, -0.889, 1.081, 0.475, 0.626, 0.576, 0.172, 0.778, 0.677, -0.939, 1.131, -1.545, 0.525, -0.889, 1.485, -0.636, -0.131, 1.182, -1.141, 1.182, -1.394, 0.879, 1.03, -1.747, 0.02, -1.192, -1.192, 0.475, -1.899, -0.03, -0.535, 1.03, 0.98, -0.384, 1.384, -1.394, -0.737, 0.374, 0.071, 0.828, 0.172, 0.222, 0.98, -1.394, -0.182, -0.081, -0.687, -1.646, 0.576, 0.727, 0.121, -0.485, -0.687, 0.98, -0.03, 1.333, -0.636, 0.424, 1.131, 0.879, -1.646, 1.081, -1.697, 0.626, -1.141, 0.525, -0.434, 1.333, 0.323, 1.03, -0.939, -0.232, 0.677, 0.323, -0.636, -1.444, 1.131, -0.434, 1.586, -0.838, -1.141, -0.485, -1.242, 0.02, -0.03, 0.374, -0.838, 0.98, -1.141, -1.394, 1.485, -0.232, 0.222, 0.121, 0.424, 0.677, -0.081, -1.848, -0.99, 0.273, 0.525, 0.727, 0.828, -0.131, -0.939, 0.778, 0.727, 1.182, 0.727, 0.626, 
0.677, 0.879, 1.232, -1.596, -0.131, 1.232, 0.626, 1.384, 0.172, -1.848, -0.03, -1.848, 0.222, 0.778, 1.182, 0.828, -1.04, 0.525, -0.283, -0.485, 0.071, -0.737, 0.727, -1.242, -0.283, 0.576, 1.081, -1.747, 0.071, 1.182, 0.071, 1.333, -1.495, -1.242, 0.98, -1.04, -0.283, 1.434, -0.737, 0.626, 0.879, 0.172, 1.434, -0.636, -0.434, 1.182, 0.475, -1.495, 1.03, 0.576, 0.222, 0.475, -0.788, -1.242, -1.242, -1.545, 0.778, 1.081, -1.242, 1.384, -0.939, 0.626, 0.828, -0.434, 1.687, 1.384, 0.222, -0.283, -1.747, -0.182, 1.535, 0.778, -2.05, 0.677, -0.636, 0.273, 1.535, 0.071, -1.394, 0.467, 0.189, 0.801, -1.867, -0.033, -2.2, 0.69, -0.366, 0.301, -0.366, -0.644, 0.189, -0.144, -0.255, -0.533, 0.301, 0.467, 0.245, 0.856, -0.7, -1.867, -0.311, -1.756, -0.589, -0.033, -2.089, 0.023, 0.467, 0.634, -0.2, 0.078, -0.311, 0.356, -0.311, -1.811, 0.578, -1.033)
fifaRaw <- c(fifaRaw, 0.578, 1.19, 0.356, 1.023, 0.078, -2.033, -0.811, 0.189, -0.589, 1.301, 0.189, -2.256, -1.033, -0.144, -0.533, -0.644, -2.145, 1.301, -1.589, 0.301, 0.69, -1.2, 0.245, -0.644, 1.19, 1.023, 0.134, -0.144, -0.088, -0.866, 0.301, -0.255, 0.467, 0.467, 0.634, -0.644, 0.134, 0.578, -2.089, -0.255, -0.477, 0.856, 1.523, 1.412, 0.634, -0.366, 0.69, -2.2, 0.301, -0.2, -0.033, 0.801, 1.245, 1.19, 0.801, -0.033, -0.533, -0.033, 1.856, 0.356, -0.644, 0.412, 0.467, 0.801, -0.477, 0.578, -0.033, 0.134, 0.523, 0.023, -0.533, 0.856, 1.023, -2.2, 0.967, 0.023, -2.2, 0.301, 0.189, 0.912, -0.255, 0.245, 0.467, 0.523, 0.023, -1.756, -0.088, -2.256, -0.477, 1.301, 0.412, 1.134, -2.145, -0.255, 0.189, 0.301, -1.644, 1.19, 0.523, -0.033, -2.2, 0.078, 1.078, 0.412, 0.523, -0.422, 0.856, -1.089, 0.912, 0.356, -2.145, 1.245, -0.589, 0.745, 0.467, 0.745, 1.078, 0.245, 0.578, -0.2, 1.245, 0.634, 0.801, 0.856, 0.301, 0.69, 0.801, -1.756, -0.033, 1.412, -2.089, -2.089, -2.145, 0.023, -0.255, -0.255, -2.256, -0.755, -0.2, 0.245, 0.134, -0.255, 0.523, -0.644, -2.256, -0.7, 0.912, -0.311, 0.523, 0.356, 0.745, -0.366, 0.134, 1.023, 0.967, -2.145, -2.033, 0.301, -0.589, 0.801, 0.078)
fifaRaw <- c(fifaRaw, -0.922, -2.422, -2.033, -0.422, -0.589, -0.533, 1.356, -0.422, 0.634, 0.412, 0.856, -0.255, 0.301, 1.134, 0.856, 0.301, 0.912, -0.311, 0.023, 1.19, -0.088, -0.422, 0.578, 0.023, -0.866, 0.467, 1.412, 0.745, -0.978, 0.356, -0.755, -2.311, 0.356, 1.134, 0.69, 0.967, -1.756, -0.533, -0.811, -2.089, 1.245, 0.301, -1.867, -0.7, 0.745, 0.578, 1.023, -0.755, 1.134, -0.144, -0.477, 0.245, -0.422, 0.412, 0.856, 0.356, 0.856, 0.356, 1.356, 0.245, -1.756, -0.311, -0.811, 0.467, -2.2, -0.422, 0.412, 0.189, 0.356, 0.412, 0.356, 0.078, 0.69, 1.634, 1.69, -1.922, 0.523, -0.033, -0.644, 0.134, 1.19, -2.089, -0.088, 0.967, -2.2, 0.801, 1.301, 1.023, 0.69, 0.245, 1.523, -2.256, 0.412, 0.189, 0.245, 0.467, -1.922, -2.2, -0.033, 0.912, 0.245, -0.422, -1.922, 1.412, -0.477, -2.089, 0.301, 0.801, -0.589, 0.412, 0.023, 0.578, 0.245, 0.745, 0.189, 0.356, 0.967, -0.755, -0.033, 1.023, -0.644, -0.033, -0.144, 0.634, -2.256, 0.189, 1.579, -0.033, 0.078, 1.19, 1.023, 0.078, 0.856, -1.978, -0.589, -0.144, -2.2, 0.578, 0.356, 0.967, -0.033, -2.2, 1.134, 1.134, 0.912, 0.134, 0.523, -0.311, 0.189, 0.578, 0.412, -0.144, 1.245, -0.255, 0.078, -0.422, -2.256, -0.811, -0.144, 0.912, -2.256, 0.356, 0.634, 1.245, -0.144, 1.023, 0.023, 0.023, 0.801, 1.579, 0.634, 0.912, 0.412, -1.756, 0.801, -1.867, -0.922, -0.533, 0.245, 0.856, -0.255, -0.644, -0.2, 0.023, 0.801, -0.422, 0.745, -0.033, -2.256, 1.023, 1.467, 0.745, -0.033, 1.023, 0.412, -1.2, 1.023, 1.245, 0.69, 0.69, 0.301, -0.589, -2.033, 0.134, 0.467, 0.856, -0.033, 0.634, 0.189, 0.189, -1.811, 0.301, 0.356, 0.412, 0.356, 0.356, 0.912, 0.801, 0.801, 0.412, 0.245, 0.69, 0.801, 0.301, -0.866, 1.023, -1.922, 0.189, 0.745, 0.523, 0.356, -0.311, -2.256, 0.912, -2.311, -0.477, 0.301, 0.745, 0.967, 1.078, 0.801, 1.301, 0.634, -0.2, 0.634, 0.801, 0.578, 0.189, -0.533, 0.245, -2.2, 0.578, 1.412, 0.023, 1.301, -1.978, -2.2, 0.523, -0.2, 0.467, 0.856, -0.589, -0.422, 0.023, 0.412, 1.19, 0.912, -0.422, 1.634, -0.7, -2.2, 0.578, 0.467, 
-0.755, 0.023, 0.634, -1.7, -1.756, -1.978, 2.19, -0.366, 0.912, 1.301, 0.467, -0.033, -0.7, -0.811, 1.467, 1.412, -0.2, 1.134, 0.578, -0.811, 1.078, 0.134, -2.534, 1.023, -0.477, -0.311, -0.144, 0.467, -0.533, 0.061, 0.26, 0.458, -2.386, 0.26, -2.121, -0.137, -0.203, 0.193, 1.318, 0.26, 0.723, 0.723, 0.392, 1.252, -1.063, 1.119, 0.921)
fifaRaw <- c(fifaRaw, 0.855, -1.063, -1.923, 0.656, -0.865, -1.261, 0.127, -2.055, 0.326, 0.524, 0.656, 1.252, -0.931, -1.658, 0.061, 0.656, -1.592, 0.193, -0.005, 0.127, -0.005, 1.053, 0.326, 0.789, -0.732, 0.458, 0.656, 0.656, 0.127, 0.855, -1.923, 0.193, 1.45, 0.392, 0.458, -1.658, 0.59, -1.923, 0.855, -0.071, -1.46, 0.524, 0.524, 0.789, 0.392, 0.723, 0.127, 0.855, -0.005, -1.394, 0.656, 0.458, -0.269, -0.203, 0.26, 2.243, 0.59, -1.724, 0.921, -0.203, 0.723, 0.59, 0.392, -0.666, 0.127, -0.005, -2.187, 0.127, -1.658, -0.666, -1.526, 0.458, 0.59, 0.26, 0.127, 0.921, 0.524, 0.789, -1.195, 0.921, -0.6, -0.402, -0.203, 0.789, 0.458, -1.261, 0.061, -0.402, -1.989, 0.392, 0.193, 1.516, -2.452, -0.931, 1.119, -2.253, -0.6, 0.723, 0.656, -0.534, -0.137, -0.005, 0.458, 0.789, -0.137, 0.855, -2.319, -1.658, -0.005, 1.384, 0.789, -1.46, -0.005, -0.6, -0.6, -0.137, 0.458, 0.193, 0.127, -2.386, 0.26, -0.203, 0.987, 0.59, 1.45, 0.987, 0.193, 0.326, 1.053, -1.327, 0.921, 0.458, -0.203, 0.193, 0.127, 0.392, 1.185, 0.26, 0.127, 1.053, 0.59, -0.203, 0.723, 0.326, 0.193, 0.193, -1.857, 0.59, -0.6, -1.724, -2.055, -1.195, 0.656, 0.326, -1.658, -1.526, -0.137, 0.458, 0.326, 0.524, 0.392, 0.326, 0.127, -2.452, 0.326, 0.392, 0.392, 0.127, 0.921, 0.061, 0.127, 0.392, -0.6, 0.789, -2.187, -2.716, -1.526, 0.987, -0.071, 0.127, 1.252, -0.931, -0.402, 0.127, 0.458, 0.392, 0.524, 0.061, 0.193, 1.252, 0.656, 1.252, 0.458, 0.061, 0.458, 0.127, 1.119, 0.26, -0.005, -0.071, 0.061, -0.005, 1.053, -0.6, 0.392, 0.656, 1.252, 0.127, 0.921, 0.193, 0.26, -2.386, 0.127, 0.392, -0.005, 1.318, -2.187, 0.458, 0.921, -2.253, 0.656, 0.193)
fifaRaw <- c(fifaRaw, -2.055, 0.524, 1.318, 0.326, 0.656, 0.789, -0.005, -0.732, -1.394, 0.061, 0.392, 0.656, 0.26, 0.855, -0.269, 0.326, -0.269, 0.921, -2.319, 0.326, -0.071, -0.402, -2.187, -0.336, 0.921, 0.326, 0.723, 0.59, 1.45, 0.326, -1.394, 0.656, 0.723, -1.526, 0.524, -0.666, -0.203, 0.458, 1.053, -1.658, -0.402, 0.656, -1.857, 0.392, 0.921, 0.789, 0.855, 0.789, 1.252, -2.319, 1.119, 1.053, 1.185, -1.129, -1.195, -2.518, -0.666, 1.185, 1.252, 0.326, -2.452, 1.516, -0.071, -2.452, 0.656, -0.6, 0.921, 1.714, 0.458, -0.137, 0.458, -0.336, 0.524, 1.318, 0.656, -0.137, -0.997, 0.59, 1.053, -0.402, -1.195, 0.326, -0.666, -0.005, 0.326, 1.185, -0.137, 0.855, -0.203, -1.658, 0.326, -1.989, -0.666, -0.336, -2.386, 0.127, -0.269, 0.524, 0.59, -2.187, 0.392, 0.524, 0.524, 0.987, 0.061, 0.458, -1.658, 0.855, 0.855, 0.458, 0.987, -0.137, 0.723, 1.252, -0.865, 0.061, 0.326, 0.127, -1.526, -0.997, 0.921, 0.656, 0.26, -0.137, 0.789, 0.26, 1.119, 1.185, 0.656, -1.195, 0.789, -1.46, -0.402, -2.782, 0.855, -0.534, 0.26, 1.053, 0.524, -0.732, 0.193, -0.269, 0.392, 0.326, 0.193, -0.005, -2.055, 0.987, -0.865, 0.326, -0.798, 0.855, -0.137, -1.989, 0.458, -0.203, 0.26, 0.789, 1.252, -0.997, -2.386, 0.921, 0.326, 0.326, 0.458, 0.723, 0.458, 1.384, -2.253, -0.005, 0.723, 0.061, 1.582, 0.326, 0.127, 0.723, -0.666, 0.524, 0.524, 1.318, 0.656, 0.193, 0.26, 0.458, -2.848, 0.127, 0.921, 0.326, 1.318, 0.392, -1.989, 0.789, -2.253, -0.865, 1.185, -0.732, 1.318, -0.137, 0.855, 0.458, 0.193, 0.061, 0.392, 1.252, -0.402, 0.061, 0.193, 0.987, -2.716, 0.326, -0.203, 0.59, 0.59, -1.592, -1.79, -1.394, -0.402, 0.524, 0.392, -0.269, -0.865, 0.458, 0.59, -0.402, 0.26, 0.127, 0.987, 0.524, -1.923, 0.127, 0.723, -0.137, 0.458, -0.666, -0.468, -2.716, -2.716, 1.318, 0.987, -0.071, 0.59, -0.071, 0.789, 0.061, 0.326, 0.723, 0.987, 0.26, -0.137, -1.195, -0.666, 0.723, 0.26, -2.584, 0.987, 0.061, 1.053, 1.384, -0.005, -1.46, 1.011, 0.622, 0.733, -1.488, -0.044, -1.599, -0.877, 0.456, -0.266, 0.345, 
-0.044, 0.345, 1.289, 0.456, 1.177, 0.011, 1.233, 1.622, 1.177, -0.933, -1.544, 0.289, -1.599, -0.711, -0.6, -1.544, 0.955, 1.066, 0.9, 0.511, -0.766, -0.6, -0.377, -0.211, -1.321, 1.233, 0.511, -1.544, -1.099, 0.789, -0.655, 0.233, -1.821, 0.011, 0.178, 1.066, 1.455, 0.289, -1.655, 0.011, 1.122, -0.322, 0.233, -1.655, 0.178, -1.266, 0.289, -1.099, -2.043, 0.122, 0.955, -0.488, 0.622, -0.044, 0.178, -0.6, 0.233, -0.877, 0.567, 1.344, 1.122, 0.844, 0.567, 2.233, 0.4, -1.377, 0.789, 0.955, 0.122, 1.4, -0.655, -1.21, 0.011, 0.233, -1.544, 1.233, -0.711, -0.766, -0.766, 0.233, 0.4, 0.067, -0.933, 0.9, 0.955, 2.288, -0.6, 1.066, -0.711, 1.455, 0.567, 0.067, -0.266, -0.877, -0.766, -0.655, -0.877, 0.122, -0.155, 1.622, -1.821, -1.155, -0.1, -1.821, -0.933, 0.955, 1.122, 0.733, 1.122, 0.733, -0.488, 0.456, -1.21, 0.567, -1.599, -1.21, -0.433, 0.955, 0.4, -1.21, 0.178, -0.711, -0.711, 0.345, 0.4, 0.233, 0.345, -1.655, -0.322, -0.766, 0.789, -0.155, 1.455, 0.345, 0.289, -0.988, 0.456, -1.655, 1.4, 0.678, 0.456, 1.344, -0.1, -1.599, 0.456, 0.511, 0.844, 1.011, -0.155, -0.711, 0.511, 1.177, -0.377, -0.6, -2.099, 0.511, 0.678, -1.266, -1.488, -1.599, 0.178, 1.066, -0.822, -1.766)
fifaRaw <- c(fifaRaw, -0.544, 0.844, 1.289, 1.011, 0.178, -0.1, 0.289, -1.821, 0.733, -1.044, 1.233, -0.044, 1.289, -0.655, -0.6, 0.011, -1.044, 1.289, -1.932, -1.488, -0.544, 0.233, 0.733, -0.766, -0.044, -1.877, -1.766, -0.044, 0.622, -1.044, 0.733, -0.655, 1.566, 0.122, 1.622, 1.4, 1.011, -0.488, -0.266, 0.289, 2.01, 0.678, 0.789, -0.377, -0.711, -0.544, 0.844, -0.488, -0.211, -0.711, 1.622, -1.044, -0.044, 0.844, 0.178, -1.821, -0.6, -0.711, 0.733, 0.678, -1.655, 1.066, 1.566, -1.988, -0.766, 0.456, -1.599, 0.122, 1.344, 0.622, 1.844, -0.155, -0.877, -0.322, -0.6, 1.122, 0.067, -0.433, -0.433, 1.177, -1.266, 0.289, -0.877, 1.455, -1.988, -0.155, -0.211, -0.711, -1.988, -0.211, -0.044, -0.6, 0.678, -0.044, 0.4, 0.178, 0.622, -0.655, 1.844, -1.766, 0.4)
fifaRaw <- c(fifaRaw, -0.988, 0.067, 0.844, 1.733, -1.544, -0.377, 1.177, -1.599, 0.955, 1.677, 1.788, -0.711, 1.233, 2.344, -1.766, 0.067, 0.955, 0.844, 0.233, -1.488, -1.988, 1.122, -0.544, -0.6, 0.4, -1.377, 1.899, -0.377, -1.821, 0.511, -0.711, 1.677, 1.955, -0.988, -0.433, 1.289, 0.011, -0.377, 1.788, 0.011, -0.266, -0.433, 0.289, 1.455, 0.067, -0.822, 0.678, -1.544, -0.544, -1.377, 2.233, -0.322, -0.266, 1.455, -0.766, 0.678, -1.488, 0.511, 0.9, -1.821, -1.599, -0.933, -0.488, 0.289, -1.932, -0.211, -0.488, 0.955, 0.733, -0.433, 1.177, -0.822, -0.822, -0.1, 0.122, 1.122, -0.711, 0.345, 1.289, -1.488, -0.155, -0.044, -0.766, -1.21, -0.211, 0.844, 0.067, -0.711, -0.766, 0.733, -0.488, 1.566, -0.377, 0.511, 1.566, 0.844, -1.377, 0.345, -1.71, 0.789, -1.21, 0.678, -0.1, 0.955, -0.266, 0.789, -1.099, -0.711, -0.155, 0.622, -0.766, -1.544, 1.899, -0.377, 1.233, -0.488, -0.377, -0.655, -1.21, -0.377, 0.345, 0.955, 0.622, 0.289, -0.988, -1.321, 1.788, -0.1, 0.456, 0.9, 1.4, 0.678, 0.511, -1.71, -0.766, 0.289, -1.044, 0.178, 0.9, -0.155, -0.044, -0.044, 0.511, 1.289, 1.4, 1.066, 0.622, 1.011, 1.289, -1.544, -0.433, 1.4, 0.233, 0.955, -0.155, -1.988, -0.544, -1.544, 0.678, 1.511, 0.511, 1.011, -0.877, 0.067, -0.433, 0.622, -0.266, -0.655, 0.955, -1.155, 0.4, 0.289, 1.011, -1.655, 0.122, 1.011, 0.233, 1.289, -1.544, -1.432, 0.289, 0.4, -0.155, 1.455, -0.6, -0.044, 0.9, 0.233, 1.566, -0.711, -0.488, 1.4, -0.044, -1.599, 0.289, 0.9, -0.322, 0.011, -0.655, -1.21, -1.377, -1.599, 1.344, 1.011, -0.766, 1.122, -1.155, 0.4, -0.211, -0.155, 1.788, 1.066, -0.044, -0.1, -0.655, -0.655, 1.733, 0.345, -1.766, 0.678, -0.544, -0.1, 1.677, 0.067, -0.766, 0.824, 0.302, 0.824, -1.837, 0.093, -2.15, -1.107, 0.458, 0.615, 0.719, -0.22, 0.928, 0.406, 0.615, 1.293, -0.585, 0.771, 1.397, 0.771, -0.324, -2.046, 0.458, -1.89, -0.272, -0.063, -2.255, 0.615, 0.406, 0.51, 0.824, -1.055)
fifaRaw <- c(fifaRaw, 0.197, 0.302, 0.615, -1.837, 0.25, 0.406, 0.354, -0.168, 0.876, 0.145, 0.563, -2.255, 0.458, 0.928, 1.189, 0.667, 0.615, -2.203, 0.145, 1.137, 0.563, 0.563, -2.15, 0.093, -1.785, 0.98, -1.159, -2.307, 0.667, 0.302, 0.563, 0.719, 0.928, 0.145, 0.876, 0.458, -1.107, 0.667, 0.667, 0.197, 0.406, 0.667, 1.397, 0.458, -2.203, 0.719, 0.406, -0.481, 0.667, -0.376, 0.197, 0.51, 0.093, -2.203, 0.667, -1.42, -0.063, -1.211, -0.585, 0.25, -0.116, 0.25, 0.51, 0.667, 0.406, -1.003, 1.084, 0.719, 0.667, 0.145, 0.771, 0.51, -1.368, 0.876, -1.42, 0.145, 0.771, -0.533, 0.51, -2.359, -0.846, 1.084, -2.411, -0.846, 0.667, 0.771, 0.406, 0.458, 0.771, 0.145, 0.824, -1.942, 0.719, -2.255, -1.472, -1.211, 1.189, 0.093, -1.89, 0.667, -0.585, -0.324, 0.563, 0.145, -0.116, 0.51, -2.359, -1.159, -0.168, 0.719, 0.302, 1.867, 0.876, 0.406, 0.771, 0.041, -2.046, 0.98, 0.563, 0.093, 0.615, 0.041, -0.22, 0.615, 0.824, 0.406, 0.667, 0.719, -0.063, 0.51, 0.51, 0.041, 0.406, -1.577, 1.084, 0.302, -1.785, -2.046, -2.098, 0.458, 1.084, 0.25, -2.046, 0.406, 0.406, 1.032, 0.667, 0.354, 0.093, 0.51, -2.359, 0.667, 0.406, 0.615, 0.354, 0.197, -0.846, 0.041, 0.145, -0.742, 0.458, -2.359, -1.942, -1.316, 0.615, 0.719, 0.563, 0.51, -1.994, -2.255, 0.145, 0.563, 0.458, 0.51, 0.25, 0.719, 0.197, 0.98, 1.397, 1.032, -0.116, 0.876, 0.041, 1.293, 0.563, 0.563, -1.003, 0.145, -0.324, 0.667, -0.324, 0.51, -0.168, 0.876, -0.481, 0.615, 0.458, 0.667, -2.255, -0.168, 0.041, 0.51, 1.032, -2.098, 0.667, 0.876, -2.516, 0.354, 0.667, -2.203, 0.145, 0.928, 0.302, 0.51, 0.458, -0.481, -0.324, 0.145, 0.615, 1.137, 0.458, -0.585, 0.51, -1.159, 0.458, -0.898, 0.928, -2.307, 0.51, 0.197, -1.159, -2.203, 0.197, -0.116, 0.25, 0.458, 1.032, 0.876, 0.197, 0.145, -0.376, 0.563, -2.307, 0.354, -0.585, 0.51, 0.824, 1.241, -1.89, -0.168, 0.719, -2.203, 0.771, 1.137, 0.667, 0.98, 0.771, 0.928, -2.463, 0.302, 0.25, 1.189, -0.063, -2.046, -2.307, -0.168, 0.563, 0.719, 0.615, -1.837, 1.45, 0.093, -2.255, 0.615, -0.481, 
0.98, 1.658, 0.145, -0.272, 0.98, -0.116, -1.003, 1.397, 0.615, 0.667, -0.481, 0.667, 0.615, 0.145, 0.041, 1.241, -1.89, 0.719, -0.324, 1.345, 0.041, -0.063, 0.563, -1.524, 0.041, -2.046, 0.615, 0.615, -2.307, 0.406, 0.406, -0.742, 0.615, -2.307, -0.22, -0.637, 0.615)
fifaRaw <- c(fifaRaw, 1.032, -0.116, 0.667, -1.263, -0.22, 0.98, 0.51, 0.876, -0.011, 0.51, 1.084, -1.942, -0.168, 0.197, -1.159, -2.203, -0.637, 1.293, -0.168, -0.063, 0.25, 0.719, -0.063, 1.137, 0.406, 0.458, 0.406, 0.824, -2.046, 0.041, -2.098, 1.345, -0.272, 0.145, 0.406, 0.928, -0.063, 0.563, -1.055, 0.302, 0.406, 0.145, -0.011, -2.203, 0.824, -0.481, 0.824, -0.168, 0.25, -0.011, -1.263, 0.25, -0.063, 0.406, 0.458, 0.615, -0.637, -1.733, 1.189, 0.145, -0.324, 0.615, 0.615, 0.563, 1.137, -2.098, 0.145, 0.667, 0.302, 0.824, 0.824, -0.116, 0.771, -0.22, 0.51, 0.876, 1.137, 0.51, 0.667, 0.354, 0.302, -2.046, 0.458, 1.032, 0.771, 0.824, 0.093, -2.15, 0.041, -2.255, 0.51, 0.563, 0.51, 0.824, -0.846, 0.667, -0.429, 0.406, -0.116, -0.011, 0.563, -1.211, 0.302, 0.51, 0.98, -2.307, -0.533, 0.145, 0.719, 0.667, -2.098, -2.046, 0.25, -0.272, -0.063, 0.615, -0.324, -0.22, 0.719, 0.458, 0.98, -0.116, -0.116, 0.719, 0.458, -1.89, 0.041, 1.032, 0.563, 0.458, -0.794, -1.629, -2.203, -2.203, 0.458, 0.928, -0.429, 0.667, -0.324, 0.719, 0.458, 0.354, 0.824, 1.084, 0.563, -0.168, -0.742, 0.093, 1.241, 0.719, -2.15, 0.563, -0.116, 0.51, 1.345, 0.145, -0.898, 0.902, 0.902, 0.635, -1.657, -0.164, -1.87, -0.804, 0.742, 0.582, 0.795, 0.369, -0.271, 0.902, 0.582, 1.275, -0.378, 1.915, 1.595, 1.222, -0.964, -1.55, -0.538, -1.71, -0.751, 0.635, -1.604, 1.381, 0.582, 1.275, 0.902, -1.177, -0.431, -0.271, -0.591, -1.71, 1.008, -0.218, -0.964, -1.337, 1.062, -0.697, 0.209, -1.604, 0.209, 1.275, 1.275, 0.422, 0.529, -1.764, -0.324, 1.861, -0.111, 0.582, -1.657, 0.102, -1.177, 0.529, -1.124, -1.924, 1.168, 0.902, 0.422, 0.209, 0.529, 0.049, -0.431, 0.742, -0.697, 1.222, 0.742, 0.369, 0.369, 0.155, 2.128, 0.689, -1.444, 0.955, 1.328, -0.697, -0.058, -1.017, -1.231, 0.689, -1.337, -1.817, 0.848, -0.911, -0.697, -0.911, -1.124, -0.484, -0.111, 0.689, 1.062, 1.328, 0.635, -1.071, 0.155, -0.431, -0.164, -0.431, 0.635, 0.209, -1.177, -0.378, -1.071, -0.538, 1.062, -0.591, 1.915, -1.817, -1.284, 
1.115, -1.71, -1.284, 1.008, 1.275, 0.582, 0.848, 0.049, 0.742, 1.541, -1.497, 1.168, -1.71, -1.231, -0.218, 1.488, 0.902, -1.39, 0.422, -0.751, -0.644, 0.315, 0.209, -0.058, 0.049, -1.337, -0.644, -1.337, 0.209, 0.102, 1.328, 0.209, 0.315, -0.964, 0.848, -1.87, 0.689, 0.422, 0.742, 0.689, -0.164, -0.484, 0.902, 0.955, 1.062, 0.902, 0.422, -0.697, 0.582, 0.582, 0.209, -1.071, -1.231, 0.848, -0.111, -1.39, -1.71, -1.87, 0.102, 0.369, -0.484, -1.764, -0.111, 0.475, 0.582, 0.795, -0.004, -0.324, 0.315, -1.764, 0.102, -0.271, 1.168, -0.378, 0.955, -0.004, -0.644, 0.582, -0.804, 0.475, -1.817, -1.817, -0.857, 1.115, -0.697, 0.049, 0.102, -1.977, -1.764, 0.262, 1.062, -0.004, 0.742, -0.911, 0.102, 1.861, 1.755, 1.648, 0.262, -0.484, 0.582, -0.111)
fifaRaw <- c(fifaRaw, 1.915, 0.315, 1.115, -1.124, -0.697, -0.164, 1.115, -0.857, 0.102, 1.168, 1.861, -0.857, 0.529, 0.635, 0.582, -1.87, 0.102, -1.284, -0.111, 0.635, -1.604, 0.689, 1.488, -1.924, -0.697, 0.369, -1.497, 0.315, 1.435, 0.635, 0.262, 0.475, -0.591, -0.484, -0.591, 0.635, 1.488, -0.218, -0.857, 0.848, -0.697, -0.058, -1.124, 0.795, -1.87, -0.484, 0.102, -0.804, -1.924, -0.324, -0.538, -0.644, -0.644, 0.475, 0.422, 1.062, -0.378, -0.538, 0.209, -1.817, 0.422, 0.102, 0.049, 1.541, 1.328, -1.657, 0.742, 1.062, -1.39, 0.155, 1.488, 0.475, 0.848, 1.222, 1.435, -1.764, 1.541, 0.155, 1.648, -0.964, -1.817, -1.924, -0.804, 0.315, -0.218, -0.058, -1.55, 2.128, 0.049, -1.817, 1.008, -0.644, 1.488, 2.341, -0.804, -0.964, 0.795, -0.591, -0.484, 1.595, 0.848, 0.102, -0.164, 1.381, 1.381, 0.102, -0.697, 0.155, -1.87, -0.324, -0.431, 1.701, -0.591, 1.381, 0.529, -0.804, 0.209, -1.444, 0.049, 0.475, -1.87, 0.529, -0.644, -0.751, 0.155, -1.657, -0.218, -0.751, 0.635, 1.008, -0.111, 1.168, -1.337, -0.058, 0.422, -0.164, 0.848, -0.697, 0.315, 1.328, -1.604, -0.271, 0.742, -0.857, -1.977, -0.591, 1.008, -0.271, -0.538, -1.017, 0.529, -0.538, 1.168, -1.284, 1.115, -0.431, 0.369, -1.817, -0.271, -1.817, 1.115, -0.911, 0.689, 0.262, 0.742, -0.004, 0.795, -0.857, 0.529, 0.422, 1.381, -0.804, -1.444, 2.074, -1.444, 0.262, -0.218, 0.209, 0.262, -0.804, -0.804, 0.529, 1.008)
fifaRaw <- c(fifaRaw, 0.049, 1.008, -0.538, -1.444, 1.222, -0.164, -0.378, 1.222, 1.328, 0.369, 1.275, -1.817, -0.644, 0.529, 0.529, 1.062, -0.004, -0.538, -0.058, -0.911, 0.529, -0.697, 1.915, 0.529, 0.689, 1.222, 0.902, -1.497, 0.848, 1.062, 0.369, 1.008, 0.635, -1.924, -0.644, -1.657, 0.369, 0.955, -0.697, 0.155, -1.231, 1.115, -0.218, -0.644, 0.422, -0.538, 0.475, -1.071, 0.315, 0.209, 1.595, -1.55, -0.484, 0.369, 1.222, 0.689, -1.604, 0.475, -0.697, 0.848, -0.004, 1.168, -0.697, -0.857, 1.275, 0.475, 0.955, -0.591, -0.644, 0.049, 0.582, -1.444, -0.271, 1.168, 0.635, 0.529, -1.124, -1.231, -0.857, -1.817, 1.488, 1.435, -0.857, 0.369, 0.529, 1.168, 0.955, -0.111, 0.369, 1.755, 0.955, 0.155, -0.751, -0.164, 0.475, 0.422, -1.71, 0.742, -0.484, 0.848, 1.541, 0.155, -1.231, 1.237, 0.747, 0.747, -1.432, -0.015, -1.649, -0.887, 0.148, 0.257, 0.91, -0.015, -0.07, 1.292, -0.124, 1.292, -0.342, 2.381, 0.856, 1.128, -0.941, -1.595, 0.856, -1.432, -0.669, 0.311, -1.704, 1.346, 0.039, 1.455, 1.401, -0.724, -0.397, -0.451, -0.724, -1.432, 0.747, -0.124, -0.941, -1.05, 0.747, -0.941, -0.07, -1.649, -0.615, 1.401, 1.237, -0.179, 1.183, -1.486, -0.669, 1.891, -0.342, 0.856, -1.54, -0.015, -1.159, 0.039, -0.996, -1.595, -0.669, 1.618, 0.529, 0.148, 0.475, 0.257, -0.015, 0.856, -0.669, 0.529, 0.148, -0.179, 0.093, -0.233, 2.272, 0.42, -1.704, 0.91, 1.891, -0.451, 0.747, -0.07, -0.615, 0.202, -1.05, -1.595, 1.019, -0.724, -0.615, -0.887, 0.693, -0.124, -0.397, -0.288, 0.91, 1.618, 0.42, -0.724, 0.148, -0.124, 0.202, -0.778, -0.233, 0.475, -1.159, -1.105, -0.669, -0.669, 0.42, -1.05, 2.163, -1.704, -1.214, 1.019, -1.649, -0.941, -0.506, 1.237, 1.074, 0.747, -0.56, 1.401, 1.51, -1.486, 1.401, -1.323, -0.832, -0.179, 0.856, -0.015, -1.649, -0.179, -0.506, -0.288, 0.039, 0.039, 0.257, -0.342, -1.105, -0.778, -0.941, 0.747, 0.366, 1.455, 1.401, 1.237, -0.506, 0.366, -1.649, 1.128, 0.747, -0.451, 0.638, -0.451, -0.179, 0.747, 0.802, -0.288, 1.292, 1.128, -0.56, 0.693, -0.179, 0.638, 
-0.615, -1.323, 1.237, -0.288, -1.486, -1.595, -1.54, 0.366, 0.693, -0.179, -1.54, -0.615, 1.074, 0.475, 0.91, 0.039, -0.179, 0.965, -1.704, 0.747, 0.093, 1.074, -0.397, 1.237, -0.615, -0.342, 0.366, -0.832, 0.856, -1.758, -1.323, -0.724, 1.237, -0.615, -0.124, -0.179, -1.813, -1.758, -0.179, 1.292, -0.288, 0.311, -0.451, -0.07, 1.673, 1.292, 1.455, 1.128, -1.05, 0.148, 0.202, 1.891, 0.148, -0.669, -0.832, -0.669, -0.07, 1.128)
fifaRaw <- c(fifaRaw, -0.56, 0.91, 1.727, 1.891, -1.377, 1.401, -0.506, 1.183, -1.649, -0.451, -0.941, -0.778, 0.856, -1.432, 0.529, 1.401, -1.758, -1.05, 0.42, -1.323, 0.747, 1.564, 1.074, 0.257, 0.093, -0.451, -0.724, -0.724, 0.148, -0.124, -0.56, -0.778, 0.91, -0.124, 0.148, -0.887, 1.51, -1.649, -0.451, -0.669, -0.342, -1.54, -0.342, -0.124, -0.615, -0.615, 0.42, 0.475, 0.475, -0.397, 0.093, 0.366, -1.758, -0.124, -0.996, -0.015, 1.074, 0.584, -1.704, 0.638, 1.128, -1.649, -0.233, 1.292, 0.638, -0.288, 1.836, 0.747, -1.486, 1.618, 0.747, 1.836, -1.105, -1.595, -1.758, -0.887, -0.07, -0.288, -0.179, -1.268, 2.435, -0.342, -1.704, -0.07, -0.887, 1.727, 2.272, -0.778, -0.233, 0.257, -0.07, -0.778, 1.618, 0.965, -0.179, 0.148, 1.618, 1.727, -0.451, -0.669, -0.506, -1.268, 0.148, 0.856, 1.51, -0.397, 0.475, 0.965, -1.105, -0.179, -1.105, 1.128, 0.093, -1.595, -0.724, -0.669, -1.214, 0.856, -1.758, 1.019, -0.778, 0.91, 0.856, -0.233, 0.584, -0.778, -0.397, -0.506, -0.179, 1.074, -0.506, 0.311, 0.91, -1.214, -0.288, 0.42, -0.615, -1.649, -0.724, 0.91, 0.311, -0.015, -0.451, 1.346, -0.288, 1.51, 0.366, 1.019, -0.669, 0.747, -1.867, -0.342, -1.704, 0.747, -0.724, 1.074, 0.257, 1.128, -0.397, 1.074, -0.778, 0.093, 0.802, 1.945, -0.288, -1.649, 2.109, -1.323, 2, -0.451, -0.724, -0.179, -1.758, -0.832, -0.342, 1.836, -0.56, 1.237, -0.778, -1.54, -0.179, -0.015, -0.56, 1.346, 0.965, 0.802, 1.51, -1.268, -0.778, -0.124, 0.747, 1.346, 0.475, -0.56, -0.288, -0.832, 0.202, -0.778, 2.272, 0.802, 1.564, 1.51, 1.128, -1.595, 0.693, 0.42, -0.124, 1.51, 0.91, -1.758, -0.07, -1.432, -0.015, 0.965, -0.724, 0.91, -1.105, 1.074, 0.257, -0.179, 0.093, -0.669, -0.015, -0.56, 0.093, -0.124, 1.891)
fifaRaw <- c(fifaRaw, -1.432, -0.397, 0.148, 1.564, 0.802, -1.649, 0.91, -0.778, -0.778, 0.093, 1.292, -0.56, -0.724, 1.346, -0.506, -0.887, -0.724, -0.342, -0.124, 0.693, -1.649, -0.288, 0.529, 1.074, 0.475, -0.832, -1.105, -1.214, -1.377, 1.618, 1.455, -0.887, 0.093, -0.996, 0.91, 1.401, -0.179, 0.039, 1.836, 0.802, -0.832, -0.778, -0.56, -0.07, -0.506, -1.758, 0.638, -0.233, 1.019, 1.346, -0.015, -0.832, -0.572, 0.269, 0.01, -2.124, 0.463, -1.801, -0.572, 0.075, -0.054, 1.045, 0.398, 0.528, 0.463, 0.463, 1.239, -1.089, 1.369, 0.786, 1.045, -0.96, -2.06, 0.786, -0.507, -1.025, 0.14, -1.348, 0.01, 0.722, 1.11, 1.239, -0.119, -1.348, -0.507, 0.722, -2.06, -0.119, -0.313, -0.184, 0.334, 1.11, 0.075, 1.304, -1.93, 0.722, 0.075, 0.075, -0.507, 0.851, -1.866, 0.398, 1.821, 0.334, 0.398, -1.995, 0.592, -1.93, 0.851, -0.119, -1.219, -0.119, 0.722, 0.98, 0.14, 0.528, 0.14, 0.592, -0.184, -1.477, 0.592, 0.592, -0.96, -0.895, 0.075, 2.598, 0.528, -1.283, 1.11, 0.204, 0.98, -0.637, 0.269, -0.443, 0.657, -1.283, -1.995, 0.075, -1.736, -0.507, -1.801, 0.722, -0.054, 0.528, 0.075, 1.045, 0.786, 0.657, -1.736, 0.851, 0.204, -0.96, -0.313, 1.304, 0.722, -1.866, 0.14, -0.507, -1.542, 0.592, -0.119, 1.692, -1.413, -0.637, 1.239, -2.124, -0.701, 0.592, 0.786, -0.766, -0.507, -1.219, 0.786, 0.851, -0.184, 1.11, -1.995, -1.801, -0.572, 1.433, 1.304, -1.154, 0.14, -0.119, 0.01, 0.14, -0.119, 0.334, -0.054, -2.642, 0.334, -0.054, 0.851, 0.851, 1.369, 1.304, -0.054, 0.463, 1.304, -1.089, 0.463, 0.463, -0.766, -0.119, 0.204, -0.184, 1.175, 0.398, 0.075, 1.498, 0.657, -0.184, 0.916, -0.249, 0.463, 0.98, -0.184, 0.204, -1.542, -0.766, -1.866, -1.154, 0.592, 0.528, -1.283, -1.283, 0.01, 0.657, -0.443, 0.786, -0.054, 0.528, -0.119, -1.866, -0.119, 0.204, 0.463, -1.219, 1.045, -0.249, 0.398, 0.592, 0.075, 1.045, -1.607, -2.189, -1.154, 1.239, -0.766, -0.313, 1.498, -0.96, 0.463, 0.722, 0.398, 0.722, -0.119, 0.14, 0.269, 1.433, 0.916, 1.433, 0.657, 0.592, -0.313, 0.592, 1.11, 0.204, -0.054, 
-0.119, -0.119, -0.249, 1.045, -0.054, 0.463, 0.463, 0.916, -0.054, 0.592, 0.398, 0.14, -1.93, 0.269, 0.269, -0.766, 1.239, -1.93, 0.463, 1.11, -1.995, 0.722, 0.269, -2.254, 0.786, 1.11, 1.11, 0.916, 0.851, -0.119, -1.348, -1.477, -0.443, -0.766, 1.11, -0.119, 0.786, 0.01, 0.075, -0.054, 0.722, -1.607, 0.398, 0.01, -0.637, -1.801, 0.01, 0.851, 0.14, 0.98, 0.786, 1.239, 0.786, -1.672, 0.334, -0.507, -1.801, 0.528, -0.119, -0.313, 0.722, 0.398, -1.219, -0.378, 0.851, -1.089, -1.089, 0.657, 0.657, 0.398, 0.98, -0.378, -1.672, 1.498)
fifaRaw <- c(fifaRaw, 1.175, 1.369, -1.219, -0.96, -2.254, -0.831, 1.369, 0.98, -0.119, -1.995, 1.692, -0.184, -2.383, 1.11, -1.154, 0.916, 1.757, 0.269, -0.054, -0.637, -0.766, 0.916, 1.11, 0.528, 0.204, -1.348, 0.657, 1.304, -0.507, -1.025, 0.463, -0.443, -0.766, 0.528, 1.045, 0.14, 1.821, -0.831, -1.283, -0.766, -1.801, 0.204, -0.184, -2.383, 0.398, -0.443, -1.154, 0.786, -1.995, 0.463, 0.98, 0.916, 0.851, 0.398, 0.722, -1.477, 1.045, 0.98, 0.398, 1.304, -0.184, 0.851, 1.433, -1.283, 0.204, 0.592, -0.054, -1.477, -1.025, 0.98, 0.657, -0.184, 0.14, 1.304, 0.14, 1.175, 1.045, 0.916, -1.283, 0.98, -1.93, -1.283, -2.512, 0.851, -0.507, 0.528, 0.98, -0.054, -0.701, -0.378, -0.313, 0.334, 0.528, 0.463, 0.075, -1.607, 1.175, -0.507, 1.045, 0.01, 0.916, 0.01, -2.124, 0.463, 0.398, 0.592, 0.01, 1.627, -0.766, -1.93, 0.334, 0.528, 0.592, 0.334, 0.916, 0.592, 1.369, -2.06, -0.054, 0.98, 0.334, 1.304, -0.378, -0.054, 0.98, -1.995, 0.398, -0.831, 1.627, 0.851, 0.269, 0.592, 0.075, -2.383, 0.204, 0.657, 0.528, 1.239, 0.592, -2.448, 0.851, -1.413, -0.895, 1.433, -1.154, 1.498, -0.443, 1.045, 0.657, 0.592, 0.657, 0.98, 1.11, -0.313, 0.075, -0.572, 0.592, -2.642, 0.075, 0.14, 0.269, 0.269, -1.542, -1.477, -1.348, 0.14, 0.528, 0.204, -0.119, -1.348, 0.334, 0.592, -0.96, 0.398, 0.14, 0.463, 0.722, -1.801, -0.96, 0.398, -0.184, 0.398, 0.14, -0.637, -2.706, -1.866, 1.563, 0.98, 0.204, -0.249, 0.334, 1.045, -0.313, 0.463, 0.334, 0.722, 0.528, -0.054, -0.831, -0.701, -0.443, -0.313, -1.607, 1.175, 0.14, 0.851, 1.433, -0.119, -1.542, 0.613, 0.213, 0.556, -2.13, -0.358, -2.073, -0.53, 0.556, 0.156, 0.842, -0.358, 0.842, 0.442, 0.385, 1.071, -0.072, 0.899, 1.471, 0.899, -0.53, -2.302, 0.556, -0.987, -0.873, 0.042, -2.245, 0.385, 0.613, 0.613, 0.842, -0.701, -0.53, 0.213, 0.671, -1.616, 0.213, 0.213, 0.156, -0.015, 0.956, 0.27, 0.556, -2.073, 0.499, 0.899, 0.956, 0.099, 0.671, -1.902, 0.213, 1.242, 0.556, 0.385, -2.187, 0.27, -2.302, 1.128, -0.816, -1.502, 0.556, 0.556, 0.671, 0.728, 
0.671, 0.442, 0.842, 0.042, -0.93, 0.213, 0.671, 0.099, -0.015, 0.385, 1.871, 0.385, -2.473, 0.842, 0.442, 0.213, 0.956, 0.213, 0.328, 0.442, 0.213, -2.016, 0.613, -1.387, -0.13, -1.673, 0.442, 0.385, 0.442, 0.213, 0.671, 0.613, 1.128, -0.987, 0.956, -0.015, 0.385, 0.385, 0.728, 0.27, -1.159, 0.213, -1.502, -0.987, 0.556, -0.873, 0.956, -2.473, -0.473, 1.242, -2.302, -0.244, 0.499, 0.671, 0.042, 0.442, 0.499, 0.27, 0.613, -1.101, 0.671, -1.844, -1.444, -0.587, 1.071, 0.556, -2.187, 0.156, -0.473, -0.187, 0.27, 0.556, -0.015, 0.156, -2.187, -1.273, -0.701, 0.842, 0.556, 1.757, 0.899, 0.042, 0.613, 0.785, -2.302, 0.842, 0.728, -0.015, 0.671, -0.187, 0.27, 0.956, 0.499, 0.328, 0.842, 0.728, -0.015, 0.613, 0.556, 0.042, 0.328, -2.016, 0.842, 0.27)
fifaRaw <- c(fifaRaw, -1.73, -1.959, -1.844, 0.385, 0.499, -0.987, -1.559, 0.042, 0.385, 0.842, 0.499, 0.099, 0.156, 0.499, -2.416, 0.499, 0.442, 0.556, 0.27, 0.499, -1.044, -0.072, 0.042, -0.644, 0.728, -2.645, -2.359, -1.387, 0.613, 0.842, 0.213, 0.499, -2.073, -2.645, 0.156, 0.499, 0.042, 0.556, -0.015, 0.613, 0.899, 0.956, 1.357, 0.556, 0.042, 0.556, 0.156, 1.299, 0.213, 0.499, -0.301, 0.156, -0.187, 0.671, -0.187, 0.328, 0.613, 0.842, -0.072, 0.213, 0.27, 0.442, -2.245, 0.156, 0.385, 0.328, 1.185, -2.302, 0.442, 1.014, -2.302, 0.213, 0.499, -1.73, 0.613, 0.899, 0.27, 0.613, 0.328, -0.301, -0.415, -0.873, 0.328, 0.842, 0.613, -0.587, 0.556, -0.301, 0.099, -0.415, 0.785, -2.416, 0.27, -0.015, -0.987, -2.187, -0.072, 0.328, 0.385, 0.613, 0.956, 1.357, 0.328, 0.27, -0.072, 0.728, -1.844, 0.499, -0.587, 0.27, 0.785, 1.071, -1.844, -0.072, 0.899, -2.245, 0.899, 1.014, 0.842, 0.956, 0.785, 1.071, -2.588, 0.671, 0.785, 1.128, -0.072, -2.302, -2.416, -0.244, 0.785, 0.956, 0.156, -2.13, 1.528, 0.099, -2.702, 0.785, -0.301, 0.785, 1.871, 0.27, 0.042, 1.014, -0.015, -0.015, 1.299, 0.671, 0.499, -0.415, 0.385, 0.671, -0.072, 0.27, 0.785, -1.844, 0.613, 0.042, 1.357, -0.13, 0.613, 0.499, -1.559, 0.042, -1.844, 0.328, 0.328, -2.13, 0.213, 0.156, -0.53, 0.613, -2.588, -0.015, -0.415, 0.613, 0.728, 0.099, 0.728, -1.273, 0.442, 0.899, 0.499, 0.956, -0.13, 0.556, 1.128, -2.13, -0.13, 0.328, -0.244, -1.559, -0.987, 1.185, 0.328, 0.042, 0.213, 0.671, -0.13, 1.128, 0.499, 0.442, 0.728, 0.671, -1.959, 0.213, -2.359, 0.956, -0.587, 0.156, 0.613, 0.728, -0.415, 0.556, -0.758, 0.213, 0.156, 0.099, -0.015, -1.33, 0.842, -0.473, 0.728, -0.244, 0.671, -0.13, -1.33, 0.328, 0.27, 0.442, 0.728, 0.899, -0.93, -2.588, 1.185, 0.042, 0.156, 0.442, 0.613, 0.27, 1.014, -1.959, 0.156, 0.671, -0.415, 0.956, 0.442, 0.042, 0.728, -0.301, 0.499, 0.728, 1.185, 0.556, 0.385, 0.442, 0.613, -2.416, 0.156, 0.728, 0.842, 0.899, 0.099, -2.073, 0.099, -2.073, -0.301, 0.785, 0.385, 1.014, -0.13, 0.842, 0.156, 
-0.13, 0.099, 0.042, 0.671, -1.101, 0.156, 0.156, 1.014, -2.13, 0.213, -0.187, 0.671, 0.613, -2.187, -2.245, 0.099, 0.042, 0.042, 0.671, -0.244, -0.187, 0.442, 0.842, 0.842, -0.13, -0.644, 0.613, 0.556, -2.416, -0.13, 1.071, 0.499, 0.442, -0.873, -2.53, -2.416, -2.53, 1.528, 1.014, -0.072, 0.613, 0.099, 0.842, 0.099, 0.213, 0.613, 1.014, 0.27, -0.187, -0.873, -0.187, 1.071, 0.442, -2.13, 0.842, -0.072, 0.556, 1.471, 0.213, -1.73, 1.784, 0.282, 1, -2.003, -0.044, -1.481, -1.22, 0.152, 1.066, 0.543, 0.413, -0.044, 0.021, 0.478, 0.87, -0.044, -1.872, 1.523, 0.543, 0.739, -1.415, -0.436, 0.021, -0.044, 0.347, -1.611, 1.066, 0.413, -0.044, 0.739, -0.762, 0.282, 0.543, -0.24, -1.546, 0.478, 1, 0.347, -0.371, -0.175, 0.347, 0.282, -2.591, 0.347, 1.262, 1.653)
fifaRaw <- c(fifaRaw, 0.021, 0.086, -2.199, -0.11, 0.217, 1.719, 0.282, -0.893, -1.024, -1.481, 1.392, -0.11, -1.742, 0.87, 0.478, 0.086, 1.196, 0.152, 0.87, 0.217, 0.347, -0.175, 0.674, 0.805, 1.066, 0.543, 1.066, 0.021, 0.543, -1.481, 0.674, 0.805, -0.697, 0.217, -0.762, -0.24, 1.066, 0.413, -1.938, 0.739, -0.305, 0.674, -0.371, 0.282, -0.632, -0.958, 1.719, -0.697, 1.196, -2.134, 0.805, 1.719, 0.674, 1.392, 0.282, 0.152, 0.543, 0.282, 0.609, -0.762, -0.24, 0.805, 0.282, -1.481, -2.656, -0.632, 0.152, -2.656, -0.632, 1.327, 0.152, 0.543, 0.674, 0.739, -0.175, 0.021, -1.22, 0.739, -1.938, -0.11, 0.086, 0.87, -1.285, -0.175, 1.327, 0.021, 0.805, 1.066, -1.154, 0.282, 0.347, -1.22, 0.021, -0.11, 0.478, 0.413, 1.914, 0.543, 0.935, 0.282, 0.413, -1.415, 0.543, -0.371, -1.024, -0.501, 0.674, -1.677, -1.089, 1.066, 0.805, -1.154, 0.282, 0.152, 0.347, 0.805, 0.217, 0.413, -0.044, 1.327, -1.481, -1.024, -1.415, -1.742, 0.413, 1.98, 0.413, -2.134, 0.739, 0.152, 1.523, 0.739, 0.282, 0.282, 0.805, -3.113, 0.609, 0.347, 1.131, 1.327, 0.152, -0.044, 0.086, 0.217, -1.872, 0.87, -2.199, -1.22, 0.021, 0.086, 1, 0.478, -0.24, -1.807, -0.762, 0.282, 0.282, 0.805, -0.567, 0.282, 0.805, -1.611, 1.262, 1.523, 1.066, -1.611, 0.674, -0.828, 0.805, 0.543, 1.196, -1.089, 0.347, 0.152, 0.543, 0.152, 0.478, -1.481, 1.131, -1.154, 0.282, 1.327, 1.196, -2.068, 0.413, -0.567, 1.131, 0.217, -1.481, 1.327, 0.217, -2.721, 0.086, 0.152, -1.677, 1.066, 0.282, -2.068, -0.044, 0.674, -0.567, 0.413, 0.217, 0.609, 1.327, -0.371, -0.567, 0.739, -2.068, 0.217, 0.086, 0.282, -2.003, 0.543, 0.478, -1.546, -2.525, 0.086, 0.413, 0.609, 0.021, 1.262, -0.762, 1.588, -1.35, 0.347, 0.152, -0.632, 0.805, -1.22, 0.478, 1, 0.87, -2.656, 0.674, -0.893, -0.632, 0.935, 0.674, -0.893, 0.674, -0.044, -1.481, -2.591, -0.697, -0.958, 0.674, 1.262, -2.068, -2.591, 0.217, -0.828, 0.152, 0.282, -1.154, 0.87, 1.196, -2.46, -0.24, -0.893, 1, 0.739, 0.413, 0.282, 0.805, 0.021, 0.021, 1.066, 0.347, 0.413, -0.567, 0.674, 0.739, 
0.87, 0.478, 1.784, -0.697, 0.086, -1.024, 0.935, 0.021, 0.478, -0.305, -0.501, -1.415, -1.807, 0.478, 0.282, -2.525, 1.066, 0.805, 0.935, 0.674, -2.003, -0.893, 0.282, -1.35, 1.523, -0.762, 1.066, -0.436, 0.805, 0.347, 0.935, 0.347, -0.11, 0.805, -0.305, -1.677, 0.021, 0.347, -0.24, -1.285, -0.371, 1.196, -0.24, 0.478, -0.371, 0.021, -0.11, -0.697, 0.021, 0.086, 0.021, 0.347, -1.807, -0.305, -2.134, 0.739, 0.674, -0.371, 0.739, 1.196, 1.262, 0.739, -0.632, 0.021, 0.674, -0.436, -0.11, -2.329, -1.024, -2.068, 1.653, -0.11, -0.24, 0.478, -1.938, 0.347, -0.697, 0.87, 0.674, 0.086, 0.543, -1.807, 1.719, 0.152, -1.154, 1.066, -0.371, 0.086, 1.327, -1.546, 0.609, 0.87, 0.152, 1, 1.131, -0.24, 0.935, -1.415, 1.131)
fifaRaw <- c(fifaRaw, 0.543, 0.609, 0.021, 0.935, 0.021, -0.632, -2.656, 0.478, 0.413, 0.609, 0.347, -0.305, -2.656, -0.893, -2.068, 0.478, -0.436, -0.501, 0.217, 0.543, 1.327, 0.217, 0.674, 0.217, 0.021, -0.501, 0.021, 0.282, 0.739, 0.87, -0.436, 0.152, -0.11, 0.935, -0.175, -0.958, -1.546, -0.24, 1.457, -0.11, 0.152, -0.501, 0.347, 0.674, 0.543, 0.217, 0.347, 0.086, 0.609, -0.24, -1.742, 0.021, 0.347, 0.935, 0.609, 0.347, -1.089, -1.024, -0.567, 0.805, 0.935, -0.567, 0.543, -1.154, -0.11, 1.653, 0.609, 0.217, 0.805, 0.413, 0.282, -0.632, 0.935, 1.653, 1.457, -1.938, -0.11, 0.086, 0.086, 0.674, 0.543, -1.154, 1.292, 0.086, 0.823, -2.057, -0.048, -1.789, -1.186, 0.019, 1.627, 0.421, 0.153, 0.287, -0.048, 0.555, 0.555, 0.019, -2.057, 0.823, 0.555, 0.823, -1.119, 0.22, -1.588, -0.45, 0.488, -2.325, 1.091, 0.488, 0.019, 0.622, -0.584, -0.249, 0.756, 0.086, -1.588, 0.287, 1.493, 0.354, 0.019, -0.249, 0.622, 0.153, -2.057, 0.756, 1.493, 1.56, 0.555, 0.086, -2.66, -0.316, -0.182, 1.895, 0.22, -1.454, -0.784, -1.32, 1.359, -0.784, -1.655, 0.756, 0.488, 0.287, 1.091, -0.249, 0.756, 0.287, 0.622, -0.584, 0.019, 0.756, 0.823, 0.019, 0.354, -0.115, 0.153, -0.985, 0.354, 0.756, -0.918, 0.354, -0.918, 0.019, 0.756, 0.354, -1.588, 0.354, 0.019, 0.421, -0.048, 0.086, -0.182, 0.22, 1.895, -0.383, 1.024, -2.124, 0.153, 1.627, 0.689, 1.56, 0.488, 0.287, 0.823, 0.555, 0.89, -0.784, 0.22, 0.89, 0.89, -0.249, -2.459, -0.048, -0.584, -2.861, -0.383, 1.426, 0.22, 0.689, 0.823, 0.89, -0.048, 0.153, -1.521, 0.957, -1.789, -0.115, 0.555, 0.153, -1.119, -0.249, 1.091, 0.354, 1.024, 1.158, -2.124, -0.182, 0.153, -1.655, -0.249, -1.119, 0.555, 0.488, 1.694, 0.622, 0.287, 1.024, 0.555, -1.387, 0.89, -0.249, -1.253, 0.421, 0.488, -1.588, -0.918, 0.957, 0.89, -0.985, 0.22, 0.555, 0.555, 1.292, 0.756, 0.287, -0.517, 1.158, -0.851, -0.985, -0.918, -1.387, -0.182, 1.962, 0.488, -1.454, 0.756, 0.287, 1.359, 0.823, 0.153, 0.086, -0.115, -2.794, 0.421, 0.823, 0.823, 1.091, -0.048, -0.182, 0.153, 0.354, 
-0.784, 0.555, -2.593, -1.253, -0.115, -0.45, 0.823, 0.622, 0.287, -3.129, -0.918, -0.115, 0.555, 0.756, -0.182, 0.019, 1.761, -2.325, 1.158, 1.292, 0.756, -1.655, 0.354, -1.32, 0.957, 0.823, 0.89, -0.851, 0.421, 0.22, 0.153, 0.019, 0.22, -1.789, 1.225, -1.387, 0.488, 1.493, 1.225, -2.258, 0.756, -0.45, 1.56, 0.22, -0.985, 0.957, 0.153, -2.459, 0.153, 0.22, -1.856, 0.823, 0.354, -0.651, 0.22, 0.488, -0.048, 0.421, -0.784, 0.957, 1.225, -0.249, -0.249, 0.689, -2.124, 0.823, 0.622, 0.153, -2.526, -0.115, 0.354, -1.119, -2.258, 0.421, 0.488, 0.555, 0.153, 1.426, -0.784, 1.024, -1.052, 0.421, 0.689, -1.387, 0.689, -0.718, 0.22, 0.622, 0.823, -2.526, 0.622, 0.019, -0.316, 1.158, 0.823, -0.918, 0.823, 0.22, -1.32, -2.459, -0.584, -0.985, 0.689, 1.359, -0.383, -2.727, 0.354, -0.048, 0.354, 0.086, -1.052, 0.823, 1.56, -3.263, 0.421, -0.182, 1.024, 0.555, 0.287, 0.354, 0.622, 0.153, -0.115, 0.22, 0.421, 0.488, -0.918, 1.091, -0.182, 1.158, -0.584, 1.694, -1.052, 0.22, -1.588, 0.756, 0.019, 0.689, -0.115, -0.517, -0.784, -1.789, -0.182, 0.488, -1.722, 1.024, 0.622, 1.091, 0.622, -2.258, -1.253, 0.622, -0.651, 1.426, -1.387, 0.823, -0.249, 0.555, 0.555, 0.689, 0.354, 0.019, 1.091, -0.115, -1.454, -0.249, 0.287, -0.115, -1.655, -0.584, 1.426, -0.45, 0.019, -0.048, 0.287, 0.354, -0.718, 0.153, -0.249, 0.086, 0.22, -0.784, -0.784, -2.392, 0.555, 0.421, -0.651, 0.756, 1.225, 1.426, 0.689, -1.32, 0.756, 0.086, -0.718, -0.784, -1.856, -1.99, -1.186, 1.761, 0.689, -0.316, 0.555, -2.191, 0.957, -0.048, 0.957, 0.89, -0.383, 0.354, -1.856, 0.957, 0.287, -1.454, 0.823, -0.851)
fifaRaw <- c(fifaRaw, 0.153, 0.89, -1.119, 0.488, 0.823, -0.182, 1.024, 0.488, -0.048, 1.024, -1.588, 1.426, 0.555, 0.622, -0.182, 0.89, -0.45, -0.249, -2.861, 0.488, 0.622, 1.024, -0.651, 0.488, -2.526, -0.182, -2.191, 0.354, -0.651, -0.115, 0.287, 0.421, 0.823, 0.488, 0.622, -0.115, -0.182, -0.651, -0.316, 0.354, 0.287, 1.024, -0.651, 0.421, -0.584, 0.689, -0.249, -0.182, -1.253, -0.383, 1.225, -0.115, 0.354, -0.115, 0.622, 1.024, 0.153, 0.421, 0.22, -0.316, 0.622, -0.517, -1.32, -0.048, 0.89, 1.091, 0.287, 0.22, -0.918, -1.186, -0.918, 0.756, 0.756, -0.115, 0.89, -0.584, -0.651, 1.627, 0.756, 0.019, 0.354, 0.22, -0.249, -0.584, 1.225, 1.627, 1.627, -1.99, -0.383, 0.22, -0.316, 0.89, 0.689, -0.918, 1.582, 0.508, 1.247, -2.515, -0.298, -1.709, -0.366, 1.045, 0.306, -0.097, -0.567, 0.441, 0.575, 0.172, 0.776, 0.373, 0.037, 1.65, 0.978, 0.709, -0.97, 0.105, -0.164, -0.164, 0.239, -1.373, 0.642, 0.441, 1.247, 0.911, -1.507, 0.306, 0.239, 0.373, -1.507, 0.172, 0.306, 0.306, -0.836, 0.105, 0.239, -0.164, -2.717, -0.701, 0.844, 1.717, -0.164, -0.231, -1.843, 0.844, 0.776, 1.314, 0.172, -0.97, -0.836, -2.179, 1.65, -0.231, -1.642, 0.844, 0.508, -0.097, 1.381, 0.105, 0.306, 0.037, 1.045, -1.104, 0.373, 0.441, 1.112, 0.709, 0.709, 0.508, 1.045, -3.254, 1.045, 0.642, 0.172, -0.903, -1.642, -0.366, 1.515, -0.164, -1.172, 0.642, -0.298, 0.642, -1.44, 0.037, 0.037, -0.231, 1.784, 0.239, 1.448, -1.104, -1.037, 2.053, 0.709, 1.582, 0.172, -0.164, 0.508, -0.903, 0.508, -2.112, 0.172, 1.045, 0.306, 0.441, -2.112, -0.433, 0.978, -2.045, -1.44, 0.844, 0.239, 0.844, 0.642, 0.306, 0.776, 0.441, -0.97, 1.247, -1.239, -1.037, -0.634, 1.515, -0.567, -1.507, 1.045, -0.97, 0.105, 1.314, -1.642, 0.239, 0.508, -0.164, -1.239, -0.567, 0.575, -2.246, 1.784, 0.575, 0.373, -0.03, 0.911, -0.97, 0.776, 0.575, -0.298, 0.844, 0.306, -0.433, -0.634, 1.247, 0.709, -0.366, 0.239, -0.164, -0.164, 0.373, 0.373, 0.105, -0.701, 0.978, -0.97, -1.507, -1.575, -2.112, -0.231, 1.784, -0.298, -0.366, 0.239, 
0.172, -0.164, 0.776, -0.5, 0.642, 0.508, -2.246, 0.239, 0.037, 1.784, 0.776, 0.105, -0.164, 0.105, -0.097, -0.231, 0.373, -2.582, -1.978, -0.903, 0.709, 0.978, 0.508, -0.097, -1.373, -0.769, 0.105, 0.508, 0.239, 0.911, 0.105, 0.844, -1.172, 0.844, 1.381, 0.776, -1.642, 0.642, 0.239, 0.508, 0.844, 1.247, -0.903, 0.575, -0.298, 0.844, -0.03, 0.911, -2.112, 0.306, -1.642, 1.179, 1.448, 1.851, -2.717, -0.366, -1.44, 0.776, 0.373, -1.709, 0.575, 0.373, -2.582, -0.03, 0.575, -1.373, 1.112, -0.03, 0.844, -0.231, 0.373, -0.634, 0.373, -0.164, 0.306, 1.045, 0.441, 0.373, 0.709, -0.836, 0.037, -1.911, 0.373, -2.045, -0.231, 1.717, -1.776, -1.978, 0.575, 0.037, 0.642, 0.978, 1.045, 0.239, 0.373, -0.433, -0.433, -0.097, -1.843)
fifaRaw <- c(fifaRaw, 0.441, -0.433, 0.776, 1.247, 1.045, -2.381, 1.179, -0.366, -0.366, 0.844, 0.978, 0.373, 0.037, 0.441, -0.164, -2.851, 0.508, 0.105, 0.978, 0.441, -0.03, -2.314, -0.836, -0.298, -0.231, -0.836, -1.978, 0.911, 0.844, -1.575, 0.037, -0.097, 1.314, 1.112, 0.037, 0.239, 0.978, -0.97, -1.306, 1.515, 0.575, 0.642, 0.306, 1.112, -0.298, 0.373, 0.508, 1.381, -1.172, 0.441, -1.507, 1.515, 0.844, 0.306, -0.03, -1.172, -1.978, -2.314, -0.366, 0.172, -1.911, 0.037, 0.441, 0.911, 1.247, -1.642, -1.642, 0.373, -0.701, 0.978, 0.105, 0.978, -0.836, 0.776, 0.508, 0.642, 0.239, -0.567, 1.381, 0.642, -1.575, 0.306, 0.508, -0.5, -1.104, -0.5, 1.112, -0.097, 0.575, 0.239, 0.306, 0.239, -0.5, 0.172, 0.373, -0.231, 0.239, -1.172, -0.903, -1.978, 1.381, -0.298, 0.105, 0.239, 1.045, 1.448, 1.247, -0.5, 0.441, 0.642, -0.5, -0.298, -1.911, -0.836, -2.045, 1.247, -0.097, -0.5, 0.441, -2.448, -0.567, -0.567, 1.112, 0.441, 0.105, 0.306, -1.575, 1.851, -0.433, -0.298, 0.911, 0.373, 0.373, 0.978, 0.172, 0.172, 0.709, -0.03, 1.247, 2.053, -0.634, 0.373, -0.903, 0.441, 0.105, 0.709, 0.508, 1.314, 0.844, -0.097, -1.709, 0.575, 0.911, 0.776, 0.642, 0.373, -2.515, -0.298, -1.776, -0.231, 0.844, -0.231, -0.03, -1.239, 1.112, 0.172, 0.575, 0.575, -0.366, -0.836, -0.903, -0.164, 0.978, 0.978, 0.105, -1.037, 0.105, 1.112, 0.105, -0.298, -1.709, -0.366, 0.776, -0.231, 0.037, -0.567, -0.836, 0.642, 0.776, 0.239, -1.172, -0.164, 0.441, 0.306, -1.978, -0.5, 0.172, 0.776, 0.978, -1.44, -0.231, -0.903, -1.575, 1.045, 1.381, -1.239, 0.239, -0.567, 0.642, 0.844, -0.164, 0.441, 0.575, 0.105, -0.298, -0.567, 0.306, 1.784, 1.247, -1.642, 0.105, -0.164, 0.709, 1.179, -0.03, -1.776, 0.662, -0.582, 0.21, -0.921, -1.034, -0.017, -0.356, -1.599, -0.017, 0.21, -1.938, 0.662, -0.356, 0.323, 1.34, -1.034, 1.453, 2.018, 0.662, -0.921, -0.921, 0.662, 0.323, -1.599, -0.582, -0.243, -0.13, -0.243, 1.114, 1.453, -0.695, -0.243, -0.356, -2.164, 1.905, 1.114, -0.695, 1.34, 0.323, 1.227, 1.227, 0.097, 0.323, 
-0.017, 1.34, 0.775, 0.662, -0.017, 1.114, -1.034, 1.34, 0.436, -0.921, 1.114, 0.436, 1.001, 0.549, -0.243, -0.582, -0.017, -0.243, 1.453, 0.323, 1.001, -0.243, 0.21, -0.808, -0.808, -1.712, -0.13, -0.582, -0.243, -0.017, 3.035, 0.097, 0.888, -0.243, 0.662, 0.323, 0.662, 0.436, -1.26, 0.549, -0.356, 1.114, 0.436, -1.373, -1.034, -1.486, 1.453, 0.775, -0.582, -0.017, -0.469, -0.582, 1.679, -1.486, 0.323, -0.017, -0.13, -0.13, 0.097, 0.323, -2.164, -0.356, -2.051, -0.808, -1.486, 0.549, 0.436, -2.051, 0.21, 1.679, -2.051, -0.469, 1.453, 0.097, -3.068, -0.13, -0.017, 0.549, 0.888, 0.436, 0.662, 0.097, -2.39, 1.227, 1.679, 0.662, -1.938, -0.017, -1.034, -0.582, 0.21, -0.017, -0.469, -2.503, 0.888, -0.808, 0.323, 0.436, -0.469, 1.34, 0.775, -1.938)
fifaRaw <- c(fifaRaw, 0.775, 0.662, 1.453, 1.001, -0.808, -0.582, 0.21, -0.695, 0.097, 1.453, 0.888, 0.436, 0.662, -0.017, 0.21, 0.436, 0.549, -0.582, 0.21, 0.549, 0.21, -0.808, 0.323, 0.549, 0.775, -0.808, 1.114, -0.243, -0.582, -0.469, 0.097, -0.017, 0.21, -0.582, 0.097, -0.469, -2.277, -0.582, -0.017, -0.921, -0.808, 0.549, -0.469, -0.695, -1.034, 0.436, 1.114, -1.599, -1.034, -1.26, 0.549, 0.436, -0.017, 1.566, -1.147, -0.017, -1.147, 0.323, 0.097, 0.21, -0.921, 0.323, 1.34, 0.323, 0.549, -0.243, 0.436, 0.549, 0.097, 2.018, -0.469, 0.775, -0.017, -0.695, -1.034, 0.549, -1.599, -1.373, 1.227, 1.114, 0.097, -0.13, -0.13, -0.356, -1.825, 0.21, -0.017, -0.243, 1.114, 0.662, 0.323, 0.888, -1.147, 1.114, -1.034, 1.001, -0.13, 0.549, -0.017, 0.549, -1.034, 0.549, -0.695, -0.921, -0.695, 1.227, 0.662, -0.582, 0.097, 0.323, -0.921, 0.549, 0.323, -3.633, -0.356, -2.051, -1.373, -1.599, -0.582, -0.921, 0.21, 0.323, 0.21, 2.583, 0.21, -1.825, 1.566, 1.114, 0.775, -1.486, -1.373, -0.921, 0.21, 1.792, -1.034, -0.017, 1.792, 0.549, 1.114, 1.114, 0.097, 1.679, 1.453, 2.357, -0.695, 0.549, 1.453, 1.114, -1.938, 0.097, -1.147, -1.034, 0.775, 0.549, -0.469, -0.921, 1.905, -0.469, 0.097, -0.582, -0.13, 0.775, 1.453, -0.243, 0.436, 0.21, -0.469, 0.21, 1.566, -0.243, -2.051, -0.695, 0.775, 0.888, -0.469, -0.469, -0.017, 0.775, -0.582, 0.888, 2.131, -0.695, 1.453, 0.775, -1.373, -0.356, -0.017, -0.582, -1.938, 0.549, -0.695, 0.323, -0.017, 0.775, -2.503, -0.695, 0.888, -0.243, -0.921, -0.017, 0.21, -1.26, 0.097, 0.662, 0.21, 1.114, -1.147, 0.436, 0.662, 0.21, -1.486, -0.469, 0.436, 0.323, -1.712, 1.566, 0.662, -1.486, 0.775, 0.21, -0.469, 0.888, 1.566, 0.097, 0.323, -0.243, -0.017, -1.486, 0.323, -1.034, -1.26, -0.582, 1.566, 0.549, -1.825, 0.21, -0.469, 0.549, -0.243, -0.017, -0.695, 0.21, -0.13, -0.243, -0.017, -1.825, 0.662, 0.097, -0.921, 1.227, 0.662, 0.323, 0.775, 1.34, -2.503, -1.825, 1.566, -0.017, -0.582, -0.13, 0.21, -0.582, 1.34, -0.469, -0.243, 0.549, -0.695, 0.662, 
0.775, 0.888, 0.775, -1.373, 0.549, 0.323, 2.018, -0.13, -0.469, 0.21, 0.662, -0.017, 0.323, 0.662, 0.323, 0.323, -0.695, -2.503, 0.097, 1.227, -1.712, 0.21, -0.243, 1.453, 0.097, 0.888, 1.227, 1.001, -0.356, -0.695, 1.001, -0.808, 0.097, -0.582, 1.34, 0.21, -0.921, 0.21, 0.775, 1.453, 0.549, 0.436, -1.938, -0.921, -0.243, 1.227, -1.147, -0.582, 0.323, 0.323, 1.114, -0.808, -0.582, 0.21, 0.21, -0.13, -0.921, 1.227, -0.921, -0.017, -1.26, -0.017, -0.017, -0.808, 2.583, 0.775, -1.486, 1.227, -0.582, 1.227, -1.938, -0.695, 1.453, 0.888, 0.436, -0.582, -2.051, -2.277, 1.114, -0.356, -0.356, 2.018, -1.712, -0.243, 0.888, -0.469, -1.147, 1.737, 0.969, 1.249, -0.705, 0.341, -1.403, -0.008, 0.69, -0.426, 0.969, 1.109, 0.551, 0.969, -0.008, 1.109, 0.621, -0.217)
fifaRaw <- c(fifaRaw, 1.667, 0.411, 1.179, -1.333, 0.621, -0.077, 0.132, -0.356, -1.264, 0.551, -0.356, -0.077, 1.039, -1.822, -0.775, 0.062, -1.473, -2.031, 0.83, 0.9, 0.272, -0.636, 0.411, -0.775, -0.147, -1.194, 0.132, 0.9, 1.458, -0.915, 0.132, -2.31, -0.008, 0.551, 0.341, 0.551, -1.543, -0.845, -0.705, 1.877, -0.217, -2.45, 1.179, 0.83, 0.76, 0.83, 0.411, 1.039, -0.636, -0.217, 0.76, 0.76, 0.83, 1.388, 1.109, 0.9, 0.551, 0.9, -1.403, 1.249, -0.147, -0.147, -0.217, -1.264, -0.636, 1.109, -1.822, -1.264, 0.411, -0.008, 1.388, -2.101, 0.411, -0.147, 0.272, 0.969, 0.272, 1.179, -2.45, -0.077, 2.156, 0.132, 1.039, -0.566, 0.621, 0.83, 0.062, 0.062, -2.171, 0.062, 0.83, -0.008, -0.008, -2.729, -0.356, 0.969, -1.054, -0.496, 0.83, -0.496, 0.341, 0.481, 0.341, 0.76, 0.969, -1.752, 0.9, -2.589, -0.496, -1.822, 2.086, -1.194, -1.752, 0.341, 0.76, -0.356, 1.039, -2.31, 0.83, 0.341, -0.636, -0.426, 0.202, 0.481, -2.101, 1.737, 1.179, -0.077, -0.426, 0.9, -1.124, 0.621, 1.249, -0.426, -0.496, -0.636, -0.984, -0.147, 0.76, -0.147, -0.845, -0.426, 0.341, -0.147, -0.356, 0.132, 0.272, -0.705, 0.9, -0.566, -1.473, -1.961, -2.31, -0.915, 1.458, 0.969, -0.217, 1.109, -0.217, 0.83, 1.039, 0.551, 1.249, 1.179, -2.589, 1.179, 0.411, 1.528, 0.202, 0.062, -0.147, 0.411, 0.551, -0.287, -0.077, -2.171, -2.171, 0.202, 0.9, 0.202, 0.76, 0.969, -2.938, -0.496, 0.202, 0.9, 0.341, 0.551, 0.621, 0.062, -2.031, 0.76, 0.969, 0.341, -0.287, -0.008, -0.496, 0.69, 0.341, 0.062, -1.682, -0.287, 0.551, 0.411, 0.062, 1.877, -1.892, 0.202, -1.194, 0.69, 1.318, 1.877, -1.403, -0.008, -1.124, -0.008, 0.341, -0.147, 1.318, -0.147, -2.799, -0.077, -1.543, -1.613, 1.039, 0.132, -0.426, -0.426, 0.481, -1.264, 0.341, 0.9, -0.915, 0.969, 0.132, 0.062, 0.621, -0.636, -0.845, -0.287, 0.551, -2.45, 0.9, 0.551, -0.984, -2.589, 0.411, -0.287, 1.179, 0.132, 1.039, -0.636, 0.551, -0.984, -1.403, -0.356, -0.775, 0.062, -1.892, 0.69, 0.83, 1.039, -2.31, 0.551, -0.845, -0.356, 0.411, 0.551, 1.109, 0.132, -0.287, 
-0.287, -1.752, 0.551, -0.217, 0.9, 0.132, -0.426, -0.984, -0.147, -0.775, 0.969, -0.566, -2.38, 0.132, -0.008, -1.333, -0.426, -0.147, 1.877, 1.179, 0.272, 0.272, 0.481, -1.124, 0.202, 2.086, -0.566, 1.597, 0.551, -0.636, 0.9, 0.062, -0.217, 1.318, -0.984, 0.202, -2.241, 1.528, 1.318, 0.062, -0.566, -0.496, -0.147, -0.775, 0.76, 1.458, -1.752, -0.356, 0.621, 0.341, 1.318, -1.194, -0.077, -0.287, -0.147, 0.621, -0.984, 0.9, -0.705, -0.426, 0.202, 1.179, 0.202, 0.341, 1.318, 0.411, -1.124, 0.551, 0.69, -1.333, -1.822, 0.341, 1.179, -0.008, -0.077, 0.272, 0.9, 0.132, -0.147, -0.147, 0.481, -0.287, 0.76, -1.264, -0.077, -2.171, 0.969, 1.109, 0.481, 0.272, 1.388, 1.388, 0.621, -0.217, 0.551, 1.528, -0.077, 0.132, -1.752, -0.705, -2.171, 0.411, -1.682, -0.287)
fifaRaw <- c(fifaRaw, 0.272, -1.264, -0.845, -0.775, 1.109, 0.202, -0.287, 0.341, -1.333, 1.597, 0.969, -1.403, 0.272, 0.551, -0.566, 1.737, -0.566, -0.077, 1.039, -0.636, 1.318, 2.226, -0.496, 0.062, -0.636, 0.341, 0.411, 0.481, 0.69, 1.877, 0.481, -0.984, -0.775, 0.272, 1.039, 0.132, 0.551, -0.287, -3.008, -0.356, -0.915, -0.077, 1.318, 0.132, -0.356, -1.054, 0.621, -0.217, 0.062, -1.194, -0.356, -1.054, -0.775, -0.287, 1.109, 1.109, -0.496, -0.077, -0.008, 0.83, 0.411, -0.915, -1.752, -0.077, 0.062, 0.76, 0.341, 0.551, 0.481, 0.621, 0.481, -0.636, -0.287, 0.9, 0.062, 0.83, 0.132, -1.054, -0.705, 1.039, 0.69, -0.356, -0.356, -0.217, -1.613, 0.202, 1.458, -1.054, -0.356, -0.566, 0.481, 0.132, 0.969, 0.202, 1.249, 0.272, -0.426, -0.217, 1.039, 1.109, 0.76, -1.961, -0.077, -0.705, 1.528, 1.249, 0.132, -0.636, 1.18, 0.573, 0.628, -1.854, -0.144, -1.964, -0.695, 0.187, 0.297, -0.199, -0.034, 0.518, 0.408, 1.235, 0.849, 0.077, 1.125, 1.29, 0.959, -1.909, -1.964, 0.573, -1.633, -1.909, 0.242, -1.909, 0.959, 0.573, 1.07, 0.683, -0.806, -1.192, -1.192, -0.971, -1.633, 0.573, -0.034, -1.247, -0.53, 1.014, -0.089, 0.518, -1.192, 0.077, 0.132, 0.408, 1.29, 0.353, -1.743, -0.144, 1.07, -0.364, -0.144, -1.688, 0.739, -1.743, 1.235, -0.144, -1.854, 0.683, 0.959, -0.695, 0.904, -0.144, 0.132, 0.628, -0.144, -0.585, 0.022, 1.621, 0.408, 0.904, 0.022, 1.787, 0.242, -2.019, 0.849, 0.959, -0.971, 1.235, -0.144, -0.916, 0.187, -0.53, -1.688, 1.014, -1.302, -1.743, -0.751, 0.518, -0.089, 0.573, 0.297, 0.242, 0.518, 1.125, -0.806, 0.794, -1.523, 0.849, 0.518, 0.628, 0.132, -1.137, -0.916, -0.916, -1.412, 0.353, 0.022, 1.456, -1.798, -0.585, 0.077, -1.688, -0.806, 1.014, 0.739, -0.585, 1.014, 0.849, 0.518, 1.345, -1.633, 1.4, -1.909, -1.081, -0.695, 0.297, 0.132, -2.129, 0.022, -0.199, -1.026, -0.089, 0.794, 0.628, 0.739, -1.688, -0.585, -1.302, 0.683, 0.242, 1.566, 0.904, 0.132, 0.353, 0.077, -1.633, 0.904, 0.132, 0.187, 0.408, 0.794, 0.022, 0.904, 0.132, 0.959, 1.511, 0.959, 1.125, 
0.683, 0.904, -0.144, -0.916, -1.743, 0.518, 0.628, -1.688, -1.798, -1.743, -0.089, 0.849, -1.468, -1.081, 0.242, 0.518, 0.353, 0.463, 0.187, 0.187, -0.199, -1.688, 0.628, -0.364, 0.573, 0.849, 1.29, -1.081, -0.254, 0.187, -1.523, 0.739, -2.074, -1.854, -0.695, 0.408, 0.187, -1.026, -0.034, -2.129, 0.739, -0.254, 0.849, -0.916, 0.739, -0.199, 1.125, 1.345, 1.4, 0.959, 0.849, -0.034, -0.475, 0.077, 1.511, -0.254, 0.849, -0.751, -1.247, -1.578, 0.353, -0.53, 0.297, 1.235, 1.621, -0.53, 0.022, 0.573, 0.518, -1.743, -0.475, 0.463, 0.794, 0.959, -2.019, 0.408, 0.959, -1.743, -0.144, -0.034, -1.854, 0.904, 1.18, 0.739, 1.29, 0.187, -0.144, -0.309, -1.798, 0.518, 0.849, 0.297, -0.695, 0.794, -0.916, 0.463, -0.089, 1.125, -1.743, -0.089, -0.751, -0.585)
fifaRaw <- c(fifaRaw, -1.909, -0.034, 0.959, -1.026, 0.628, -0.53, -0.089, -0.199, 0.683, 0.959, 1.235, -1.909, 0.297, -0.64, -0.475, 0.959, 0.849, -2.35, -0.53, 0.904, -1.743, 0.408, 1.125, 1.125, 0.739, 1.18, 1.456, -1.743, 0.297, 1.125, 1.125, -0.199, -2.129, -1.909, -0.254, 0.628, 1.07, 0.628, -1.688, 1.621, -0.254, -2.185, 0.187, -0.751, 0.959, 1.4, -0.861, 0.408, 0.573, 0.518, -0.64, 0.739, 0.739, -0.53, -0.64, 1.07, 0.959, -0.034, -1.357, 0.904, -0.806, 0.022, 1.07, 1.29, 0.022, 1.456, 0.959, -0.916, 0.849, -1.909, 0.573, 1.125, -1.743, 0.573, -1.247, -1.302, 0.408, -2.019, 0.573, -0.199, 0.794, 0.739, -0.64, 0.518, -1.026, -0.42, 0.408, 0.022, 0.904, -0.254, 0.683, 1.07, -1.633, -0.254, 0.187, -0.64, -1.412, -0.254, 0.794, 0.518, -0.364, -0.53, 1.125, 0.077, 1.18, -0.034, 0.959, 0.849, 0.463, -2.074, 0.077, -1.854, 0.573, -1.688, 0.683, 0.408, 1.235, -0.199, 0.463, -0.806, -0.53, -0.089, 0.628, -0.806, -1.854, 1.787, 0.959, 1.125, -1.357, -0.42, -0.089, -1.743, 0.849, 0.739, 1.621, 0.959, 0.463, -1.357, -1.964, 1.07, 0.408, 0.022, 1.125, 1.235, 0.408, 0.904, -1.798, -1.412, 1.125, -1.302, 0.739, 0.959, -0.695, -0.475, 0.022, 0.739, 0.794, 1.29, 0.904, 0.187, 0.628, 1.014, -2.129, -0.089, 0.683, 0.353, 0.683, 0.297, -1.909, 0.683, -1.854, -0.254, 1.4, 0.683, 1.345, -0.254, -0.089, 0.628, 0.408, -0.034, -0.53, 1.07, -0.806, 0.463, -0.144, 1.07, -1.633, 0.463, 0.794, 1.125, 1.07, -2.24, 0.408, 0.242, 0.187, 0.187, 1.511, -0.585, -0.364, 0.739, 0.573, 1.29, -0.695, -0.585, 0.739, 0.077, -1.854, 0.408, 1.18, -0.144, 0.683, -0.53, -0.53, -1.633, -2.074, 1.345, 0.739, -0.695, 0.573, -1.081, 0.242, 0.408, -0.254, 1.345, 0.959, -0.254, 0.353, -0.144, -0.309, 1.29, 0.463, -1.357, 1.235, -0.089, 0.573, 1.014, -0.089, -0.916, 0.881, -0.454, 0.714, -2.038, -0.454, 0.714, 0.881, -1.204, -1.037, -0.704, -0.787, 0.63, -0.787, 0.38, -1.121, -0.203, 0.13, 0.047, 0.13, -0.203, 0.213, -0.12, -0.037, -0.871, 0.714, -0.037, 0.213, -0.037, 0.047, 0.213, -1.204, -0.62, -0.871, 
0.297, -0.037, 1.298, -0.287, 1.881, -1.204, 1.381, 0.63, 0.213, 0.13, -1.538, 1.131, 0.464, 0.881, -1.121, -0.954, 0.297, -2.372, -2.288, -1.288, -0.12, 0.047, 0.547, 1.464, 0.881, -1.288, -0.203, -1.288, 0.797, 0.797, 0.213, -0.454, -0.454, -0.62, 0.213, -1.204, -0.704, 0.38, 1.298, 0.38, -2.955, 0.714, -1.621, -0.537, 0.047, 0.464, 0.547, 0.13, 0.714, 1.298, -1.371, 0.38, 0.213, 0.38, -0.871, -0.287, 1.798, 1.381, 1.047, 1.214, -0.537, 0.881, -2.872, 0.797, 2.131, 1.464, 0.797, -0.037, -1.371, 0.38, 1.131, -0.62, 0.38, -0.287, -0.62, 0.714, 0.797, -1.955, 1.298, 0.714, -0.62, 0.797, 0.047, -0.203, -0.704, 1.214, 0.714, 0.964, -0.12, -2.288, -0.954, -0.287, 0.547, 0.047, -0.12, 1.214, 0.13, -0.287, 0.881, 0.714, 1.131, 0.38, 0.464, -0.287, 0.213)
fifaRaw <- c(fifaRaw, -0.037, 1.965, 0.547, -0.787, -2.288, 0.63, 0.714, 0.797, 1.548, -0.203, 0.797, -0.12, -0.12, 1.464, -0.287, 0.63, 0.213, -0.203, -0.954, -1.454, -0.037, 0.213, -0.037, -0.037, 1.131, 1.548, 0.213, -1.538, -0.287, -0.12, -0.037, -0.954, -1.121, -1.121, -0.704, -0.203, -0.37, 0.547, -1.705, 1.047, -0.203, 1.464, 0.547, -2.538, -1.204, 0.547, 2.048, -0.037, -0.037, 1.381, -0.454, -0.37, 0.797, -1.121, -1.121, -0.704, 0.63, -0.37, -0.287, -0.787, -1.288, -2.455, 0.714, -0.871, -0.12, 0.38, 2.215, -0.871, 1.298, -0.954, 0.13, -0.787, 0.714, 1.214, -0.037, -0.454, -0.203, 0.213, 0.797, -2.705, -0.37, -0.287, 1.214, -0.037, -0.37, 0.38, 0.547, 0.213, -2.038, 0.464, -1.037, -0.537, -0.537, 0.63, 0.297, 1.131, 0.714, -1.371, -1.955, -2.955, 0.63, 0.38, -0.871, 0.797, -0.871, -0.037, 0.881, -0.287, -0.37, 0.047, -0.871, -2.205, -0.537, -0.37, -0.203, -0.037, -2.789, -0.787, 0.38, -1.204, -1.454, 0.797, -0.871, 0.047, -2.038, -0.037, -0.871, -0.203, 2.048, 1.631, -1.204, -0.537, 0.047, 2.215, 0.297, -0.37, -0.203, 0.881, -1.121, -0.704, 1.298, -2.372, 1.965, -0.62, 0.213, -0.287, 1.965, 0.464, 0.38, -0.287, 0.464, -1.121, 0.547, 0.213, 0.13, 0.547, -0.704, -0.454, -0.704, 0.13, 0.547, -2.121, -1.204, -0.62, -0.704, -0.537, -0.12, 1.381, 1.798, -1.454, 1.464, -0.704, 0.964, -0.037, 0.464, -1.037, 0.38, -0.537, 0.38, 0.297, -2.705, -0.037, -0.203, 0.13, -0.037, -0.203, 0.881, -0.287, -0.037, 2.048, 1.047, 0.797, -0.203, -0.704, -1.621, -2.288, 0.047, 0.797, -0.203, 0.797, 0.13, -0.12, 0.297, 1.214, 1.798, -0.787, -2.372, -1.121, 0.547, 1.548, 0.38, 0.547, 0.714, -0.203, 0.13, -0.871, 0.714, -1.037, 0.63, 0.464, 0.797, 0.13, 0.464, 0.63, 0.547, 1.214, -0.537, 0.047, 1.131, 1.047, 0.714, 0.964, -1.621, -0.454, -0.287, -1.371, -0.454, -0.203, 0.464, 0.881, 0.13, 0.13, 0.047, 0.714, 1.464, -1.121, 0.797, -0.12, -0.954, 1.464, 0.213, 1.131, -0.871, 0.881, 1.047, -0.787, -0.871, 0.38, -0.62, 1.047, 0.38, 0.464, -2.372, 0.547, -0.537, -0.12, 0.881, 0.464, -0.37, 
0.297, 0.63, -0.787, 0.714, 0.213, -1.037, 0.797, 1.214, 0.464, 0.297, 0.63, -0.12, 0.63, 0.714, 1.965, -0.537, 0.547, -0.954, -1.371, -0.287, 1.047, -0.287, -0.871, -3.289, 0.63, 0.213, -0.62, -1.204, 0.63, 0.547, 1.548, 0.547, 1.548, -0.203, 0.464, 0.881, -1.871, 0.213, 1.131, 0.38, -0.037, -0.454, -1.037, 0.38, -0.704, 0.714, 0.714, -0.62, 0.213, -0.12, -0.37, 0.714, -0.454, -0.454, -0.287, 0.714, 1.298, 0.797, -0.537, 1.298, -2.372, 0.13, 0.047, -0.037, -2.622, 1.381, 1.381, 0.464, 0.547, 0.047, 2.298, 0.714, 1.047, 1.298, 0.38, 1.381, -2.622, -1.204, 0.881, 0.714, -1.288, 0.547, 1.047, -0.704, 2.215, -0.454, -0.454, 0.964, -0.37, -0.537, 0.38, -0.37, 0.13, 1.44, 0.202, -0.151, -2.095, -0.799, -1.153, 0.556, -1.683, 0.792)
fifaRaw <- c(fifaRaw, -0.033, -1.094, -0.917, -0.151, 0.379, 0.556, -0.21, -0.151, 1.145, 0.144, 0.085, -1.27, 1.086, -1.919, -0.151, 0.909, -1.565, 0.438, 1.204, 0.674, 1.086, -0.505, -0.269, 0.556, 0.32, -2.036, -0.033, -0.092, -0.151, -0.092, 0.556, 0.32, 0.556, -2.213, 1.027, 1.44, 0.261, 0.909, 1.322, -1.919, 0.556, 0.202, 0.379, 0.32, -1.27, 0.909, -2.036, 0.85, -0.917, -1.27, -0.21, 0.144, 0.968, 0.615, 0.909, 0.438, -0.092, 0.32, 0.026, -1.683, -0.446, 0.674, -0.563, -0.976, 0.733, 0.144, -2.213, 0.379, 0.733, -0.328, 0.202, -0.387, 0.674, 0.674, -0.505, -1.919, 0.438, -0.328, 0.085, 0.026, 0.85, -0.151, 0.497, 1.734, -0.622, 0.674, -1.153, -0.269, 0.909, 0.968, -0.74, -0.21, 0.556, 0.733, 0.379, 0.438, -0.681, 0.733, 0.085, -0.033, -0.446, -2.861, -0.151, 1.381, -2.272, 0.085, 1.616, 0.202, -1.153, 0.556, 0.144, 0.32, 0.497, -1.977, 1.557, -2.154, 0.085, -0.505, 0.615, 0.438, -1.27, 0.615, 0.379, 0.733, -0.917, 0.379, 1.322, -0.092, -1.801, -0.269, -0.033, 1.204, 0.379, 0.556, 1.086, -0.21, 0.792, 1.499, -1.27, 0.085, 0.202, -0.563, 0.438, 0.615, -0.622, 0.438, 0.144, 1.322, 0.438, 0.674, 0.32, 0.497, 0.438, 0.792, 0.792, -1.035, 0.202, -0.269, -1.86, -1.388, -1.919, -0.033, 0.026, -0.269, -1.506, 0.909, 1.44, 0.674, 0.615, -0.21, 0.909, -0.681, -2.272, -0.622, -0.033, -0.446, -0.151, 1.322, -0.858, -0.622, 0.438, 0.909, 0.792, -2.154, -2.213, -0.269, -0.446, 0.497, 0.261, 0.85, -2.331, -1.624, -1.329, 0.968, 0.438, 1.263, -0.092, 0.497, 0.026, 0.202, 0.615, 0.968, -1.094, 0.615, 0.556, 0.438, 0.438, 1.263, -1.035, -0.092, -0.446, 0.792, 0.085, 0.733, 0.144, 1.557, 0.026, -0.328, -0.269, -0.387, -2.036, 0.792, 0.085, -0.151, 1.263, -1.624, 0.497, 0.261, -2.567, 0.261, -1.742, -1.919, 1.086, 0.674, 0.202, 1.086, 0.085, -0.681, -0.505, -0.21, 0.379, 0.615, 0.615, -0.21, 1.381, 0.144, 0.32, 0.261, -0.446, -2.39, 0.379, -0.563, -1.742, -2.036, -1.801, 0.32, 0.792, 1.381, 1.204, 1.793, 0.32, -1.035, 0.261, 1.145, -2.508, 0.438, 0.144, -0.269, 0.144, 0.497, 
-2.979, 0.85, 1.675, -1.094, 0.792, 1.086, -1.683, 1.086, 0.909, -0.681, -2.39, 0.026, 0.202, 1.499, 0.32, -2.036, -2.449, -1.683, 1.44, 1.145, -0.976, -2.449, 0.792, 0.379, -2.743, 0.085, -0.033, 0.909, 0.615, 1.145, 0.438, -0.092, 0.202, 0.085, 0.379, 0.615, -0.622, -1.27, 0.497, 1.086, 0.379, 0.026, 0.085, -1.506, -1.094, -0.151, 1.027, -0.622, 0.497, 0.085, -0.387, -0.269, -1.919, -0.092, -0.269, -1.624, 1.145, -0.446, 0.32, 0.085, -2.449, -0.681, 0.85, 0.026, 0.674, 1.675, -0.21, 0.438, 0.32, 1.263, 0.792, 0.909, 0.497, 0.497, 0.615, -1.094, -0.505, 1.027, 0.144, -1.565, -0.505, 0.909, 0.32, -0.21, 0.261, -0.033, 0.556, 0.144, -0.033, 1.499, 0.556, 0.379, -2.095, -0.033, -2.39, 0.32, -0.387, 1.145, 1.145, 0.85, -0.799, 0.379, -0.033, 0.968, -0.269, 0.674, -0.858, -2.036, -0.681, -1.742, 0.792, -0.446, 0.32, -0.21, -2.743, 0.261, 0.144)
fifaRaw <- c(fifaRaw, 1.145, 0.792, 0.144, 0.733, -1.212, 0.909, 0.085, 0.144, 0.792, 0.556, 0.438, 1.086, -1.919, 0.085, 0.556, -0.151, 1.263, 0.026, 0.202, 0.556, -0.446, 1.616, 0.202, 0.85, 0.497, -0.151, 1.322, 0.438, -2.154, 1.322, 0.144, 1.263, -0.033, -0.033, -2.92, 0.792, -1.565, -0.74, 0.674, -0.505, 0.674, 0.32, 1.263, 0.438, 1.381, 1.557, 0.379, 1.44, -0.622, 0.556, 0.497, 0.968, -1.447, 0.615, 0.497, 1.204, 0.379, -1.447, -1.742, 0.026, 0.556, -0.033, 1.322, -0.446, -0.21, 0.085, 1.086, 0.026, -0.799, -0.622, 1.145, 0.144, -1.565, -0.033, 0.909, -0.269, 1.734, 0.379, -1.683, -1.329, -1.624, 1.263, 0.968, -0.21, 0.379, 0.32, 1.027, -0.033, -0.446, 0.261, 0.556, 0.438, 0.085, 0.085, 0.497, 0.968, 0.615, -2.154, 1.204, -0.21, 0.674, 0.85, 0.085, -0.328, -0.138, -0.787, -0.381, -2.164, -1.597, -0.868, 0.753, -2.732, 0.753, -1.111, -1.678, 0.348, -0.057, 0.915, -1.192, 0.429, 0.834, 0.51, -0.3, -0.138, -0.543, 0.753, 0.105, -1.759, -0.057, 0.186, 0.105, 0.591, 0.915, -0.219, 1.158, -0.868, 0.186, 0.591, 1.32, 0.267, -0.543, 0.429, 1.32, 0.105, 0.915, 0.186, -0.868, -0.057, -0.462, -2.326, 1.077, -0.138, -1.759, -0.787, 0.105, -0.706, -0.949, -0.057, 1.077, 0.348, -2.326, -0.138, -1.84, -0.219, 0.186, 0.672, 0.105, 0.105, 0.51, -0.625, -0.625, -2.245, -1.516, 0.267, 0.105, 0.996, -2.488, 0.591, -0.3, -2.002, -0.787, 1.239, 0.51, 2.212, 2.131, 0.915, 0.105, 1.077, -0.138, 0.186, -0.706, -1.678, 1.483, 1.158, 1.077, 1.239, -1.759, -2.651, -0.462, 0.348, 0.672, -2.975, 0.51, -0.138, -0.138, -0.3, 0.51, 0.348, -0.219, 1.564, -0.219, -0.138, 0.834, 0.186, -1.111, 1.32, 0.024, -0.381, 0.51, 0.267, -0.625, -2.164, 0.591, 1.564, 0.024, -0.462, -1.921, 0.024, 0.51, 0.105, 1.726, -1.921, 1.483, 1.239, 0.186, -0.3, 0.51, -1.597, 1.726, 1.402, -0.462, 0.186, -1.435, 0.51, -0.057, 1.077, -1.03, 0.51, -1.759, 0.834, 0.105, -0.3, 0.024, -1.111, -0.381, 0.672, 1.564, 1.158, 0.915, -0.706, 0.753, 1.645, 0.996, 0.834, 0.672, 0.753, 0.834, -0.219, -1.597, -1.273, -0.625, 
-0.138, 0.267, 0.915, 0.429, -1.354, -1.03, 0.024, -1.759, 0.51, 0.348, 0.591, -1.435, 0.51, -1.678, -0.462, -1.678, 0.348, 0.186, 1.158, 0.672, 0.996, -0.868, -0.625, 1.483, 0.672, -0.381, -0.706, -0.706, 0.186, -0.787, -0.138, -1.759, -1.354, 0.996, -2.651, -0.3, -0.625, 0.672, -0.868, 0.024, 0.591, 1.158, 0.024, -0.462, 1.077, 0.105, 1.077, 0.267, -0.381, 0.186, 1.969, -0.787, 0.51, 0.348, -0.219, -2.245, 0.915, 1.32, 1.564, -0.3, -2.732, -2.894, -1.678, 0.186, 0.834, 0.834, 0.915, -0.3, -1.192, 0.429, -0.219, 0.753, 0.348, -0.057, -0.949, 0.591, 0.024, 0.915, -0.462, 0.672, -0.787, -1.111, 0.429, 0.024, -0.3, 0.348, 0.591, 0.672, 0.105, 1.564, -0.625, -0.057, 0.024, -1.03, 0.186, -0.219, -0.625, 0.51, -0.543, 1.077, 0.105, 0.267, -0.219, 0.834, 1.564, 1.239, -0.949, 0.672, 0.915, -2.326, 0.267, -0.138, 0.591, -0.057, 0.915, -0.057, 0.105, 0.996, -0.462, 0.996, 0.429)
fifaRaw <- c(fifaRaw, 0.915, -1.516, 0.591, 0.672, 0.186, 0.834, -0.3, -1.111, -0.381, 1.158, 0.672, -0.057, 0.834, 0.51, -0.057, -1.192, -0.057, 0.753, -2.245, 0.51, 0.834, 0.267, -0.949, 1.483, -0.219, -0.949, 0.51, -2.245, -1.678, 0.996, 0.024, 0.186, -0.625, 0.348, 0.591, -0.949, 1.888, -1.678, -2.732, 0.996, 0.672, 0.024, 1.807, 0.105, 0.753, -1.111, 0.915, 1.077, -0.381, 0.591, 0.105, -1.759, 1.807, 1.077, 1.645, -0.462, 1.239, -0.625, 0.267, 0.672, -0.219, -0.706, 1.402, -0.138, -1.273, 0.105, 0.267, -2.083, -0.138, 0.996, 0.024, -0.462, 0.348, 0.429, -0.787, 0.591, -1.111, 0.915, 0.915, 0.753, 0.591, 0.186, -0.381, 0.267, -0.462, -0.625, -0.381, -2.083, 0.429, 0.834, -0.381, -1.597, -1.516, 0.591, 0.915, -2.651, 1.402, -0.787, -0.381, 0.996, 2.212, 0.996, -0.787, 0.753, -0.625, -0.381, 0.996, 1.402, -0.381, 0.267, 0.105, 0.105, -2.002, -1.435, -1.435, 1.239, 0.753, 0.834, -0.057, -1.111, -0.138, -0.625, -0.219, 1.564, -0.462, -0.625, 0.915, 0.753, -0.057, 0.834, -0.706, 0.753, -0.381, -0.625, -0.543, 1.077, -0.706, -0.057, -1.111, 1.158, -0.543, 0.51, -1.84, 0.672, -0.057, -0.462, 0.429, 0.51, 0.348, 1.402, -0.219, 0.996, 0.591, 1.077, 0.348, 1.645, 0.429, 0.753, -1.597, 0.591, 0.672, 1.239, 1.158, -0.543, 0.429, -0.381, 0.591, 0.51, 0.024, -0.381, 0.996, -1.597, -1.516, -0.787, 0.429, 0.672, 0.753, -0.787, 1.239, -1.435, -0.057, 0.105, 1.077, -1.597, 0.591, 0.996, -0.3, -0.3, -2.002, 1.402, -0.057, 0.591, 0.672, 0.753, 0.348, -0.706, -0.787, 1.077, -0.138, -0.462, 1.726, -0.138, -1.516, 0.51, -0.219, -0.706, 0.915, -1.354, -0.381, -0.625, -0.3, 1.32, 1.229, 0.426, 0.627, -1.481, -0.377, -1.481, -0.226, 0.326, 0.075, 0.577, -0.477, 0.778, 0.778, 0.527, 1.279, 0.175, 1.229, 1.229, 0.978, -1.13, -1.581, 0.276, -1.33, -1.18, -0.828, -1.531, 0.928, -0.377, 0.727, 0.978, -1.079, -0.979, -0.728, -0.527, -1.481, 0.828, 0.276, -0.728, -1.431, 0.978, -0.477, 0.627, -1.28, 0.276, 0.125, 0.627, 1.129, 0.226, -1.732, 0.226, 1.581, 0.527, 0.125, -1.782, 0.677, -1.481, 
1.129, -0.778, -1.933, 0.326, 1.129, -0.577, 1.079, -0.628, 0.426, 0.878, -0.126, -1.33, 0.376, 0.828, 0.577, 0.376, -0.527, 2.283, 0.075, -1.531, 1.029, 1.631, -0.176, 1.079, -0.678, -1.23, 0.125, -1.28, -1.782, 1.029, -1.481, -0.879, -1.029, 0.276, -0.728, 0.727, -0.778, 0.978, 0.928, 0.778, -0.979, 1.029, -0.728, 0.577, 0.426, 1.179, 0.476, -1.28, -0.828, -1.079, -1.079, 0.527, -0.678, 1.48, -1.782, -1.13, -0.226, -1.882, -0.477, 1.179, 0.677, -0.276, 0.828, 0.627, 0.426, 1.43, -1.631, 1.179, -1.732, -1.28, -0.929, 0.878, 0.276, -1.682, 0.276, -0.276, -0.377, -0.076, 0.226, 0.175, 0.727, -1.38, -0.427, -1.431, 0.828, -0.076, 1.882, 1.129, -0.076, -0.226, 0.627, -1.732, 0.778, -0.176, 0.577, 0.577, 0.577, -0.176, 0.878, 0.627, 0.476)
fifaRaw <- c(fifaRaw, 1.279, 1.029, -0.477, 0.627, 0.426, 0.577, -1.18, -1.732, 0.426, 0.727, -1.481, -1.732, -1.631, 0.226, 0.828, -0.678, -1.732, 0.125, 0.627, 0.527, 0.928, 0.075, -0.025, -0.025, -2.033, 0.778, -0.577, 0.727, 0.276, 1.179, -0.728, -0.076, 0.376, -0.527, 0.778, -1.983, -1.631, -1.18, 0.878, 0.476, -0.628, 0.778, -2.133, -1.832, -0.527, 0.878, -1.18, 0.476, -0.226, 1.029, 0.928, 1.38, 1.179, 0.878, -0.377, -0.377, -0.126, 1.631, -0.126, 0.878, -1.481, -0.929, -0.728, 0.677, -0.828, 0.677, 0.677, 1.53, -1.38, 0.276, 0.778, -0.327, -1.933, -1.079, 0.075, 0.476, 0.978, -1.732, 0.627, 1.38, -1.983, -0.276, 0.426, -1.531, 0.376, 1.279, 0.828, 1.43, 0.376, -0.427, -0.025, -0.778, 0.727, 0.878, 0.627, -0.879, 0.577, -1.28, 0.677, -0.678, 1.029, -1.882, 0.125, -0.126, -1.13, -1.983, -0.527, 1.179, -0.778, 1.129, 0.577, -0.327, 0.426, 0.025, 0.075, 1.079, -1.832, 0.376, -1.732, -0.076, 0.527, 1.129, -1.732, 0.025, 1.079, -1.33, 0.727, 0.928, 1.33, -0.577, 1.179, 0.978, -2.033, 0.878, 1.38, 1.129, 0.426, -1.631, -1.933, -0.025, 0.376, 1.33, -0.327, -1.631, 2.082, -0.577, -2.033, -0.226, -1.28, 1.279, 1.781, -1.13, 0.577, 0.778, -0.628, -1.18, 1.681, 0.978, -0.025, -0.076, 0.527, 1.38, 0.276, -0.879, 0.426, -1.732, 0.175, 1.179, 1.229, -0.327, 1.38, 1.079, -1.13, 0.878, -1.581, 0.276, 0.577, -1.732, -0.327, -1.079, -0.828, 0.577, -2.033, -0.377, 0.075, 0.928, 0.527, -0.276, 0.928, -1.33, -0.628, 0.276, 0.125, 0.878, -0.176, 0.527, 1.029, -1.33, -0.025, -0.176, -0.226, -1.481, 0.025, 0.778, -0.176, -0.628, -0.628, 1.279, -0.377, 1.279, 0.025, 1.33, 0.928, 0.978, -1.882, 0.125, -1.732, 0.778, -0.929, 1.129, 0.075, 0.928, -0.276, 0.376, -1.029, 0.376, 0.476, 0.727, -0.527, -1.782, 1.581, -1.23, 1.38, -0.879, -1.18, -0.327, -1.33, -0.427, 0.577, 1.129, 1.079, 1.079, -1.079, -1.431, 1.129, -0.577, 0.075, 1.029, 1.029, 0.778, 0.778, -1.631, -0.628, 1.279, 0.376, 1.179, 0.878, -1.28, -0.628, 0.928, 0.727, 0.828, 1.279, 1.129, 0.928, 1.079, 1.179, -1.481, -0.327, 
0.978, 0.727, 1.229, 0.376, -1.882, 0.778, -1.732, 0.276, 1.129, 0.476, 1.43, -0.879, 0.226, 0.025, -0.025, 0.376, -0.276, 1.079, -1.23, 0.677, 0.025, 1.029, -1.682, -0.226, 0.627, 0.326, 0.978, -1.732, -0.076, 0.426, -0.076, 0.276, 1.33, -1.029, 0.175, 0.677, 0.928, 0.928, -0.728, -0.628, 0.527, 0.276, -1.732, 0.426, 0.878, -0.728, 0.978, -0.979, -1.33, -1.732, -1.732, 0.627, 0.778, -1.23, 0.778, -1.029, 0.727, -0.327, -0.527, 1.079, 1.53, 0.577, -0.226, -0.527, -0.678, 0.878, 0.677, -1.933, 1.179, -0.427, 0.727, 1.38, 0.125, -1.13, -0.79, 0.142, -1.373, -2.189, 0.434, -1.315, 0.725, -0.441, 1.016, -0.266, -0.965, -0.557, 0.375, 0.608, 0.026, 0.084, 0.725, 1.424, 0.084, -0.557, -1.723, 1.366, -1.373, -0.266, 0.375, -1.606, 1.075, 1.249)
fifaRaw <- c(fifaRaw, 1.424, 0.492, 0.55, 0.2, 0.375, 0.725, -1.198, 0.375, -0.324, 1.133, 0.725, 1.016, 1.249, 0.492, -1.548, -1.315, 0.725, -1.198, 0.317, 0.084, -1.256, 0.55, 0.841, 0.492, -0.79, -1.14, 1.482, -1.897, 1.249, 0.55, -2.247, 0.026, -0.207, 1.133, -0.441, 0.841, 0.084, -0.207, 0.608, -0.324, -0.732, -0.441, -0.324, 1.075, -1.548, 0.259, 1.133, -1.431, 0.259, -1.14, 0.725, 0.958, 0.608, -0.79, 1.016, 0.084, -0.907, 0.958, -0.207, -0.441, -0.557, -0.033, 0.608, 0.783, 0.958, 0.434, 0.55, 0.841, -0.266, -1.14, 1.016, 0.026, 0.084, 0.2, -0.557, -0.149, 0.958, -0.207, 0.492, -0.615, 0.142, -0.149, -2.13, 0.434, 1.366, -2.189, -0.149, 1.657, 0.2, -1.198, 0.317, 0.026, 0.667, 0.375, -1.839, 1.133, -0.674, -0.382, 0.958, 0.9, 0.725, -1.315, -0.091, 0.608, 0.2, -1.14, -0.615, 1.949, -0.79, -2.072, -0.557, 0.142, 0.841, 0.783, 0.667, 0.317, -0.732, 1.133, 1.016, -1.082, 0.375, -0.557, -1.373, -0.033, 0.55, 0.9, 1.191, -0.091, -0.441, 1.482, 0.259, 0.841, 0.667, -0.266, 1.133, 0.259, -1.082, -0.79, -0.033, -1.606, -0.674, -1.14, 0.55, -0.382, -0.091, -1.023, -0.382, 1.191, -0.441, 0.317, 0.434, 0.492, -0.499, -1.839, -0.965, 0.841, 0.026, 1.133, 0.259, 1.308, -0.091, 0.2, 1.133, 1.016, -1.606, -2.305, -0.557, 0.2, -0.615, 0.667, -1.198, -2.538, -1.315, -0.615, 1.133, 0.958, 0.725, 0.084, -0.674, 0.55, 1.424, 0.55, -1.431, 1.016, 0.667, 0.142, 0.667, -1.489, -0.091, 1.075, -0.441, 0.084, 0.783, 0.2, -1.023, 0.375, 2.24, 0.841, -0.732, -1.315, -0.382, -2.189, 0.841, 1.016, 0.375, 1.191, -2.247, -0.382, 1.075, -1.606, 1.191, -0.79, -1.548, 0.026, -0.091, 0.55, 0.841, -0.557, 0.783, -1.373, -0.441, -1.373, -1.489, 1.249, -0.033, 1.716, 0.783, -1.664, 1.075, 0.317, -1.839, 0.434, -2.072, 0.084, -1.606, -0.324, 0.9, 0.317, -0.033, 0.841, 1.308, 0.841, 0.434, 1.949, 1.249, -1.839, -0.033, 0.2, -1.256, 1.133, 0.55, -1.082, 0.026, 1.308, -1.198, -1.606, -0.965, 0.55, 1.075, 1.133, 1.599, -2.189, 0.958, 1.89, 0.841, -0.149, -1.198, -2.305, -1.315, 1.424, 1.308, -1.373, 
-2.305, 0.667, -0.615, -1.664, 0.2, 0.9, 0.375, 0.084, 0.9, 0.841, -0.499, 0.9, -0.033, 0.608, 1.133, -1.256, -1.14, -0.091, 0.492, -0.207, 0.55, -1.373, -1.606, -0.674, 1.366, 0.492, 0.259, 1.657, -0.674, -0.324, -0.674, -2.13, -0.79, -1.373, -1.664, 0.958, 0.2, -0.324, 0.55, -1.723, 0.375, 1.599, -0.266, -0.091, 1.191, -0.79, -0.324, 0.667, 0.608, 0.725, 1.075, -0.149, 0.608, -0.674, -1.956, -0.965, -0.324, 0.492, -1.198, -1.315, 0.2, 0.841, 0.084, 0.667, 0.55, 0.317, -0.557, 1.774, 1.133, 0.667, -1.664, -0.907, -0.091, -2.014, 0.259, -0.732, 0.259, 0.725, -0.207, 0.317, -0.79, 0.608, 0.841, -0.907, 0.55, -0.091, -1.373, 1.89, 1.016, -1.14, -0.499)
fifaRaw <- c(fifaRaw, 0.783, 0.375, -0.907, 0.783, 1.308, 1.016, 1.075, 0.841, -0.091, -2.305, 0.259, 0.259, 1.075, 1.249, -0.557, 0.725, 1.075, -1.781, 0.142, 0.841, 0.084, 0.841, -0.324, 1.191, 0.841, -0.499, 0.958, -1.198, 1.133, 0.2, -1.082, 0.608, 0.608, -1.023, 0.375, -0.848, 0.725, -0.965, 0.317, -2.422, 0.084, -1.956, -1.606, 0.725, -1.664, 1.191, 0.725, 1.075, 1.191, 1.075, 0.55, 0.841, 1.075, -0.499, 1.308, -1.14, 1.249, -2.014, 0.667, 0.142, 0.667, 1.016, -1.548, -1.256, 0.259, 1.075, 0.492, 1.308, 0.142, -1.723, -0.674, 0.958, -0.207, 0.2, 0.434, -0.033, 0.142, -2.364, -0.848, 0.9, -0.324, 0.841, 1.716, -0.848, -1.14, -1.664, 1.89, -0.033, 0.725, -0.149, 0.55, 0.2, -0.907, 0.026, 0.142, 0.317, -0.033, 0.375, -0.674, -0.149, 0.492, -1.082, -1.198, 0.608, -0.324, 1.482, -0.965, 0.434, 1.075, -1.686, 0.133, -0.537, -1.207, 0.277, -1.351, 0.516, -1.159, 0.085, -0.25, -1.255, 0.324, -0.106, 0.564, -0.968, -0.585, 0.994, 0.324, 0.851, 0.324, -1.351, 0.899, -1.063, 0.037, 0.659, -1.542, -0.681, 0.803, 1.042, 1.138, 0.468, 0.277, 0.516, 0.277, -1.255, -1.063, -0.872, 1.138, 0.851, 1.186, 1.234, 1.329, -1.542, 0.899, 0.899, -0.728, -1.016, 0.803, -1.59, -0.25, 1.377, -0.537, -1.255, -1.542, 0.611, -1.111, 1.09, 0.468, -1.829, -1.255, 0.564, 1.425, -1.494, 0.994, 0.324, 0.851, -0.202, 0.803, -0.872, 0.277, -0.681, -0.728, -1.063, 1.664, 0.946, -1.063, 0.037, 0.516, 1.138, -0.92, 0.946, 0.516, 0.994, 0.277, -1.207, -0.681, 0.181, 0.372, 0.229, 1.234, 1.473, 0.42, 0.994, 0.564, 0.085, -0.968, 0.564, -1.255, 0.899, -1.063, -1.255, 0.564, 0.707, -0.011, 0.994, -0.058, 0.659, -0.25, 1.09, 0.899, -1.159, 0.994, 1.616, -1.638, 0.707, 0.994, 0.372, -1.063, -0.154, -1.159, 0.611, 0.755, -1.733, 0.946, -1.111, 0.085, 1.234, 0.803, 1.09, -1.446, 0.803, 0.468, 0.42, -0.776, -0.489, 0.707, -1.063, -1.686, 0.468, 0.803, 1.042, 0.946, 1.042, 1.138, -1.159, 0.899, 0.659, -1.207, 0.324, -0.633, -1.542, -0.489, 0.851, 1.138, 1.281, 0.181, -1.446, 1.186, 0.803, 1.186, 0.803, 
-1.016, 0.468, 0.946, -1.59, -0.728, -0.968, -1.207, -1.542, -1.111, 0.611, 0.133, 0.611, -1.159, 0.803, 0.372, -1.063, 0.707, -0.154, 0.803, -0.92, -1.829, -0.776, 0.994, 0.229, -1.398, 1.042, 0.42, -0.441, 0.372, 0.946, 0.564, -1.925, -1.494, 0.181, 0.899, -0.633, 0.659, 1.138, -1.829, -1.638, -0.824, 0.277, 0.851, -0.776, 0.516, -1.542, 0.899, -0.585, 1.09, -1.446, 1.281, 1.09, 0.803, -0.441, 0.085, 0.851, 0.707, 0.324, 0.372, 0.899, 0.085, 0.229, 0.899, 0.946, 0.899, -1.063, -1.063, -1.398, -1.207, 0.707, 0.994, -1.398, 1.186, -1.111, -0.776, -0.537, -1.781, 1.186, -0.824, -1.733, 0.468, -0.633, 0.516, 1.042, -0.872, 0.803, -1.59, 0.372, -1.063, -1.686, 1.09, 0.564, 0.803, 0.946, -1.063, 1.138, 0.133, -0.92, 0.372, -0.537, 0.516, -1.686, -0.346, 0.851, 0.611, 0.755, 0.946, 1.616, 0.659, -1.207, 1.808, -1.111, -1.303, 0.181, 0.803)
fifaRaw <- c(fifaRaw, -1.351, -1.398, -0.776, -1.063, 0.468, 1.377, -1.159, -0.633, -1.351, 0.181, 1.473, 0.899, -0.824, -1.638, 1.042, 1.616, 1.281, -1.111, -1.351, -1.925, -1.016, 1.138, 1.377, -1.255, -1.446, 0.516, 0.277, -1.59, 0.564, 0.755, -0.25, -0.489, 0.803, 0.946, -1.303, 0.611, 0.851, 0.899, 1.09, -1.255, -1.446, 1.042, 0.803, -0.824, 0.707, -1.398, -1.686, -1.494, 1.281, 0.372, -0.25, 1.616, -0.537, 0.324, -0.25, -1.351, -1.111, -1.351, -1.59, 0.564, 0.707, 0.707, 0.755, -1.59, 0.755, 1.281, 0.277, -0.489, 0.803, -1.303, -0.106, 0.899, 0.899, 0.516, 1.234, 0.324, 0.659, 0.229, -1.303, 0.037, -0.393, 1.09, -1.638, -1.59, 0.994, 1.138, 0.468, 0.946, 0.851, 0.372, -0.92, 1.76, 0.946, -1.733, 0.277, -1.59, -1.207, -1.59, -0.106, 0.229, 0.229, 1.09, -0.537, -1.159, -1.303, 0.707, 0.851, -1.063, 0.659, 0.468, -1.686, 0.516, 1.521, -0.585, -0.106, 0.946, 0.707, -0.92, 1.281, 1.09, 1.09, 1.138, 1.329, -0.011, -1.303, -0.728, 0.037, 0.707, 0.707, 0.707, 0.229, 1.09, -1.207, 0.707, 1.09, 0.372, 1.042, -1.063, 0.803, 0.994, -1.063, 0.707, -1.59, 1.569, 0.899, 0.564, 0.324, -0.872, -1.494, 0.755, -0.441, 0.611, -0.968, -0.393, -1.877, 0.659, -1.829, -1.542, 0.946, -1.303, 1.234, 0.899, 1.281, 1.234, 1.234, 0.564, 0.564, 1.281, 0.516, 0.611, -1.303, 0.851, -1.686, 0.659, 0.516, 0.564, -0.298, -0.968, -1.255, -1.686, 0.037, 0.707, -0.537, 0.133, -1.255, -0.011, 0.899, -1.398, 0.755, -0.154, -0.441, 0.181, -1.111, -1.686, 1.281, -1.542, 0.851, 0.468, -1.59, -1.207, -1.207, 2.047, -0.011, 0.611, -0.298, 0.659, 1.281, -1.303, -0.154, -0.872, -1.207, 0.946, 0.803, 0.516, -0.154, -0.489, -1.303, -1.207, 1.856, 0.229, 1.138, -0.25, 0.611, 0.707, 1.227, 0.436, 0.881, -1.738, -0.848, -1.688, -1.293, -0.453, 0.535, 0.584, -0.404, 0.881, 0.683, 0.239, 1.177, 0.14, 1.029, 1.573, 0.98, 0.14, -1.441, 0.337, -1.787, -0.008, 0.09, -1.886, 0.634, 0.733, 0.288, 0.683, -1.342, 0.337, 0.041, -1.243, -1.54, 1.326, 0.337, -0.008, -0.7, 0.584, -1.046, 0.436, -2.034, 0.535, 0.782, 1.128, 
1.078, 0.584, -1.836, -0.107, 0.98, 0.535, 0.288, -1.639, 0.98, -1.688, 0.93, -0.404, -2.182, 1.276, 0.387, 0.337, 0.98, 0.634, 0.337, 0.387, 0.14, -1.293, -0.305, 0.683, 0.486, 0.634, 0.14, 1.474, 0.782, -1.738, 0.387, 0.535, 0.189, 1.326, -0.552, -1.194, 0.239, 0.584, -1.787, 0.634, -0.996, -0.404, -0.947, -0.255, -0.947, 0.782, 0.288, 0.337, 0.831, 1.523, -0.996, 1.276, -0.107, 0.387, 0.881, 0.831, 0.683, -1.243, 0.387, -1.243, 0.14, 0.337, -0.552, 0.733, -2.281, -1.243, 0.288, -2.182, -1.342, 0.831, 0.634, -0.206, 0.337, 0.782, -0.601, 0.93, -1.688, 0.733, -1.688, -1.293, -0.255, 0.98, 0.535, -1.738, 0.535, -1.046, -0.206, 0.189, 1.029, 0.239)
fifaRaw <- c(fifaRaw, 0.535, -2.133, -0.996, -0.058, 0.634, 0.041, 1.177, 0.683, 0.436, 0.436, 1.177, -1.836, 1.029, 0.337, 0.535, 0.93, -0.255, -1.145, 0.535, 1.029, 0.881, 1.128, 0.535, -1.54, 0.436, 0.93, -0.305, 0.14, -1.738, 0.831, 0.337, -1.738, -1.886, -1.738, 0.486, 1.078, 0.041, -1.639, -0.157, 0.683, 0.782, 0.683, 0.486, 0.189, -0.157, -2.034, 0.239, -0.354, 0.733, 0.831, 0.288, -0.7, 0.189, -0.157, -0.601, 1.177, -2.034, -1.688, -0.898, 0.634, 1.128, 0.041, 1.029, -2.133, -1.935, -0.008, 0.683, 0.584, 0.98, -0.404, 0.782, 0.98, 1.326, 0.634, 0.239, -0.947, 0.683, -0.157, 1.474, 0.387, 0.733, -1.293, -0.255, -0.157, 0.782, -0.799, 0.288, -0.7, 0.387, -1.095, 0.98, 0.486, 0.189, -2.232, 0.387, 0.535, 0.634, 0.436, -1.935, 0.634, 1.029, -2.232, -0.651, 0.387, -2.034, 0.041, 1.128, 0.535, 0.387, -0.206, -1.046, -0.206, -0.354, 0.436, 0.93, -0.157, -1.046, 0.14, -1.293, 0.337, -0.947, 0.881, -2.182, 0.634, -0.058, -1.392, -2.083, 0.14, 0.09, 0.387, 0.93, 0.881, 1.029, 0.436, 0.535, -1.243, 1.523, -1.935, 0.239, -0.601, 0.09, 0.535, 1.375, -1.886, -0.799, 1.177, -1.985, 1.326, 1.523, 1.029, 0.733, 0.782, 1.622, -2.133, 0.486, 0.98, 1.227, 0.288, -1.589, -2.133, 0.09, 0.782, 0.239, 0.535, -1.441, 1.82, 0.337, -2.232, 0.337, -1.194, 1.029, 1.474, -1.046, -0.749, 0.93, 0.239, -0.996, 1.523, 0.634, 0.387, 0.189, 0.683, 0.584, 0.239, -0.7, 0.683, -1.886, 0.782, -0.848, 1.573, 0.09, -0.058, 1.078, -1.342, 0.634, -1.836, 0.881, 0.288, -1.886, 0.337, 0.881, -1.441, 0.189, -2.182, -0.552, -0.848, 0.782, 0.881, 0.041, 0.486, -1.342, 0.14, 0.634, 0.584, 1.128, -0.058, 0.337, 1.029, -1.589, -0.354, 0.337, -0.601, -1.639, 0.189, 1.078, 0.486, -0.7, -0.305, 0.634, 0.041, 1.227, -0.404, 0.486, 0.831, 0.831, -1.441, 0.288, -1.886, 0.535, -0.058, 0.486, 1.177, 1.029, 0.189, 0.733, -1.342, -0.255, 0.189, 0.14, -1.145, -1.738, 0.93, -1.441, 1.078, -0.305, 0.041, -0.255, -1.935, -0.206, -0.157, 0.337, 0.436, 0.782, -0.404, -1.688, 1.424, 0.288, -0.058, 0.881, 0.733, 0.337, 1.128, 
-1.935, 0.436, 0.733, 0.733, 0.881, 1.177, -0.947, -0.404, 0.239, 0.683, 1.029, 0.93, 0.337, 0.584, 0.535, 1.029, -1.738, 0.634, 1.128, 0.782, 1.128, 0.09, -2.232, 0.387, -2.083, -0.255, 0.881, 0.98, 1.029, -1.293, 0.535, -0.404, 0.189, 0.288, -0.206, 0.881, -0.996, -0.157, 0.387, 1.177, -1.935, 0.239, 0.93, 0.436, 1.177, -1.589, -1.836, 0.337, -0.453, 0.337, 1.128, -1.194, -0.008, 0.782, 0.337, 1.227, -1.342, -0.749, 0.93, 0.387, -1.738, 0.436, 1.227, 0.584, 0.634, -0.947, -1.886, -1.787, -1.589, 0.535, 0.782, -0.898, 1.276, 0.09, 0.189, 0.733, -0.354, 1.276, 1.128, 0.782, -0.848, -1.54, -0.354, 1.177, 0.584, -1.738, 0.337, -0.354, 0.288, 1.029, 0.041, -1.095, 0.517, 0.517, 0.731, -1.907, 0.374, -0.838, -1.123, 0.16, 0.089, 1.016, 0.517)
fifaRaw <- c(fifaRaw, 0.232, 0.374, 0.588, 1.586, -1.693, 1.658, 1.301, 1.016, -0.41, -1.693, 0.588, 0.374, -1.693, -0.125, -0.41, 0.588, 0.731, 0.873, 1.515, -1.337, -0.766, -1.265, -1.479, 0.089, 0.659, -0.053, -0.909, -1.194, 0.873, -1.693, 0.802, -0.553, 1.016, 0.374, 0.731, -0.053, 1.158, -3.119, 0.945, 1.8, 0.446, 0.873, -1.622, 0.588, -0.624, 0.659, -1.337, -0.838, 0.873, 0.731, -0.125, 0.374, -0.053, 0.089, 0.945, 0.16, -1.551, 0.802, 0.232, 0.089, -0.339, 0.517, 2.299, 0.517, -1.052, 0.374, 1.016, 0.588, 0.16, -0.695, -1.622, -1.052, -1.194, 0.446, 0.659, -1.836, -0.481, -1.693, 0.374, 0.089, 0.089, -0.766, 1.23, 0.945, 1.301, -1.693, 1.301, -0.41, -0.909, 0.303, 1.087, 0.16, -1.265, -1.052, -0.41, -0.41, 0.018, -0.98, 1.444, -1.622, -1.693, 1.515, -1.479, -1.337, 0.446, 0.659, -0.909, 0.588, -0.196, 0.303, 1.016, -1.479, 1.158, -1.408, -1.693, -0.553, 1.586, 0.517, -1.907, 0.089, -1.622, -1.337, 0.089, 0.945, 0.16, 0.16, -2.905, -1.123, -1.265, 0.802, 0.588, 1.8, 0.945, -0.481, 0.018, 1.515, -0.98, 0.945, 0.588, -0.053, 1.444, -0.125, -0.909, 1.016, 0.517, 0.374, 1.158, 0.945, -2.263, 0.659, 0.303, -0.053, -0.053, 0.802, 0.802, 0.303, -0.267, -2.62, -0.267, 0.802, 0.945, -1.123, -1.836, 0.018, 0.446, 0.303, 1.087, -0.196, 0.588, 0.232, -1.551, 0.446, -0.695, 0.303, 0.232, 0.802, -1.622, 0.303, -0.481, -1.408, 1.158, -1.693, -0.766, -1.836, 1.087, -0.053, -1.052, 1.372, -0.125, 0.588, 0.16, 0.659, 0.303, 1.016, -0.053, 0.517, 1.301, 0.873, 1.515, 0.089, -0.339, -0.339, -0.481, 1.943, 0.873, 0.446, -1.194, -0.553, -0.624, 0.873, -1.052, 0.731, -1.123, 0.16, -1.408, 1.515, 0.517, 0.446, -1.337, -0.553, 0.232, 0.303, 1.444, 0.873, 0.802, 1.301, -1.907, -0.838, 0.232, -1.052, 0.588, 1.372, 0.802, 1.016, 0.446, -0.98, -0.267, -1.123, 0.588, 0.303, 1.087, -0.267, 0.374, -1.764, 0.16, -0.909, 1.158, -1.408, 0.945, -0.196, -1.551, -0.624, -0.41, 0.446, -0.339, 1.158, 1.016, 1.372, 0.374, -0.909, -1.907, 0.802, 0.089, 0.588, -2.121, -0.267, 0.802, 1.729, 0.16, 
-0.624, 1.158, -1.622, 1.016, 0.731, 0.945, 0.089, 1.087, 1.23, -0.838, 1.016, 1.301, 1.301, -0.053, 0.232, -1.764, -0.481, 1.087, 0.873, -0.624, -2.05, 1.943, -0.41, -1.978, 0.018, -1.337, 1.586, 2.37, -0.481, -0.053, -1.194, -1.551, -1.693, 1.586, 0.659, 0.089, -0.481, 0.089, 0.802, -0.053, -1.622, 1.016, -0.909, 0.018, -0.624, 1.586, -0.766, -0.553, 0.588, -1.194, 0.16, -1.265, 1.016, -0.196, -0.41, 0.303, 0.303, -1.265, 0.873, -1.479, 0.303, -1.337, 0.802, 1.515, -0.053, 1.372, -1.836, 0.303, 0.588, 0.731, 1.158, 0.089, 0.731, 1.444, -0.909, -0.196, 1.016, -0.624, -1.978, -0.41, 1.23, 0.018, -0.125, -1.408, 1.087, -0.695, 1.301, 0.16, 1.016, -0.553, 0.945, -2.05, -0.053, -0.125, 0.873, -0.553, 0.731, 0.731, 0.659, -0.481, 0.659, -1.265)
fifaRaw <- c(fifaRaw, 0.089, 0.446, -0.838, -0.339, -1.337, 1.586, -2.05, 1.087, -1.194, 0.303, -0.624, -1.408, -1.123, -0.339, -0.909, -0.339, 1.301, -0.909, -1.337, 0.731, 0.018, -0.267, 0.446, 1.016, 0.802, 1.087, -1.194, -1.123, 0.659, 0.945, 0.802, 0.018, -1.479, 0.089, -0.766, 0.446, 0.303, 1.444, 0.802, -0.196, 0.873, 0.517, -0.339, 0.374, 1.016, 0.374, 1.729, 0.588, -1.551, 0.945, -0.766, -0.267, 0.873, -0.553, 1.372, -1.337, 0.873, -0.41, -1.337, -0.267, 0.16, 1.016, -1.337, -0.196, -0.053, 1.158, -1.408, -0.196, -0.553, 0.303, 0.802, -0.766, -0.267, -0.553, -0.41, -0.125, 0.374, -0.624, -0.41, 0.731, 0.232, 0.374, -1.052, -0.267, 0.374, 0.802, 0.731, -0.053, 0.945, -0.695, 0.446, -1.194, -0.339, -1.123, -1.693, 0.659, 1.301, -1.265, 0.659, 0.16, 1.23, 0.446, 0.089, 1.016, 1.586, 0.446, -0.196, -1.907, -0.98, 0.16, 0.303, -1.265, 0.303, -0.053, 0.945, 1.871, -0.766, -1.836, 1.026, 0.712, 0.712, -1.606, 0.086, -1.042, -0.854, 0.211, -0.165, 0.023, -0.039, 0.838, -0.29, 0.587, 0.336, 0.462, 2.154, 0.838, 0.838, -1.042, -1.543, -0.603, -1.167, -0.729, -0.729, -1.543, 1.088, -1.105, 1.339, 0.712, -0.353, -0.165, -0.165, -0.353, -1.731, 0.336, -0.541, -1.23, -0.917, 0.587, -0.478, 0.274, -0.791, -0.165, -0.039, 0.399, 0.712, -0.165, -0.979, 0.399, 1.527, 0.086, -0.165, -1.857, 0.65, -1.105, 0.023, -0.729, -0.729, 1.339, -0.039, -0.29, 1.715, 0.9, 0.211, 0.023, -0.666, -0.541, 0.274, 1.214, 1.026, 0.65, 0.524, 1.59, 0.462, -2.233, 0.274, 0.336, -0.102, 1.088, -0.165, -0.541, -0.227, -0.478, -1.481, 1.402, -0.854, -0.666, -0.854, 1.402, -0.039, -0.227, -0.039, 0.462, 1.214, 1.652, -0.478, 1.59, -0.165, 0.65, 0.9, -0.29, -0.039, -1.418, -1.293, -0.791, -0.541, 0.524, -0.729, 1.778, -2.358, -0.666, 0.462, -2.358, -0.791, -0.165, 0.712, 0.838, 1.026, 0.838, 0.712, 1.026, -1.355, 0.775, -2.045, -0.979, 0.023, 0.963, -0.165, -1.606, -0.729, 0.211, -0.039, 0.336, 0.963, -0.165, 0.399, -2.233, -0.666, -0.603, -0.227, -0.039, 0.336, 0.838, 0.65, -0.478, 0.9, -1.481, 
0.838, 0.838, 0.775, 1.339, -0.227, -1.167, 0.023, 0.775, 0.399, 1.652, 0.211, -1.105, -0.165, 0.399, -0.415, -0.603, -1.606, 0.838, 1.402, -1.418, -1.543, -2.107, 0.65, 0.65, -0.478, -1.543, -0.791, 0.462, 0.086, -0.353, 0.086, 0.274, -0.227, -2.107, -0.039, -0.165, -0.353, 1.276, 1.276, -0.227, -0.478, 0.524, -0.979, 1.088, -2.233, -1.919, -0.478, 0.65, 1.402, -0.29, -0.478, -2.17, -1.418, 0.086, 0.587, -0.353, 1.339, 0.149, 0.963, 1.339, 1.088, 1.214, 1.214, -0.415, -0.165, 0.211, 1.966, 0.023, -1.042, -0.791, -0.541, -0.854, 0.838, -0.791, 0.524, 1.715, 1.84, -0.729, 0.838, 1.151, 0.149, -2.045, -0.415, -0.603, 1.151, 0.775, -1.794, 0.399, 1.464, -2.17, 0.086, 0.462, -1.355, -0.227, 1.402, -0.29, -0.039, 0.775, 0.274, 0.9, -0.729, 0.399, 0.211, -0.102, -1.105, 0.399, -0.039, 0.274, -0.478, 1.527, -2.233, -0.353, -0.102)
fifaRaw <- c(fifaRaw, -0.102, -1.982, 0.149, -0.102, -0.791, 0.65, 0.462, -0.039, 0.149, 0.462, -0.603, 1.088, -1.982, -0.102, -0.979, 0.524, 0.524, 1.276, -1.355, -0.791, 1.151, -1.481, 1.088, 1.778, 1.402, -0.791, 1.464, 2.216, -1.857, 0.587, 0.9, 1.84, 0.712, -2.233, -1.857, 1.214, 0.023, 0.023, 0.775, -1.669, 1.464, 0.149, -1.794, 0.023, -0.102, 1.966, 1.652, -0.729, -0.415, 0.399, -0.353, 0.086, 1.339, 0.023, -0.102, -0.102, -1.105, 1.026, 0.274, -0.666, 0.838, -1.105, 0.023, -0.227, 1.464, -0.039, -0.541, 1.402, -0.478, 0.9, -1.481, 0.211, 1.214, -1.857, -0.227, -0.478, -0.854, 0.587, -2.233, 0.65, 0.023, 0.775, 0.524, -0.917, 1.276, -0.854, -0.165, -1.293, -0.541, 0.712, -0.603, -0.165, 1.151, -1.418, -0.603, -0.165, -0.165, -1.606, 0.712, 1.464, 1.339, -0.227, -0.478, 1.464, -0.165, 0.462, 0.211, 1.214, 1.088, 1.339, -1.669, 0.9, -2.358, 0.023, -0.854, 0.211, 0.211, 1.276, -0.666, 1.214, -0.854, -0.415, -0.102, 0.524, -0.227, -1.543, 2.028, -0.917, 0.838, -0.917, -0.478, -0.478, -0.165, 0.023, 0.336, -0.603, -0.29, 0.211, -1.042, -1.543, 0.462, 0.211, -0.666, 0.023, 1.214, 0.65, 1.715, -1.731, -0.791, -0.415, 0.838, 0.399, 0.587, -0.478, 0.149, 0.9, 0.587, 1.026, 2.216, 0.336, 0.65, 1.088, 0.775, -1.982, -0.165, 1.088, 0.023, 1.026, 0.211, -0.415, -0.791, -1.418, 1.026, 1.652, 1.214, 0.211, -0.666, 0.712, -0.227, 0.838, -0.039, -0.29, -0.227, -0.165, -0.415, -0.541, 0.963, -1.418, -0.478, 0.462, -0.165, 0.963, -2.233, -1.481, 0.775, -1.23, 0.211, 1.402, -0.666, 0.65, 1.088, -0.165, 1.214, -0.165, -0.29, 1.276, -0.353, -1.669, 1.088, 0.336, -0.039, 0.524, -0.729, -1.042, -1.982, -2.045, 1.715, 1.652, -0.541, 0.963, -1.919, 1.402, -0.039, 0.149, 2.028, 1.088, -0.478, -1.105, -0.603, -0.478, 1.026, 0.462, -2.421, 1.276, -0.541, 0.086, 1.402, -0.478, -0.979, 0.801, 0.31, -0.673, -1.573, -0.918, 0.064, -0.263, 0.392, 0.31, 0.392, 0.146, 0.555, -0.263, 0.31, 1.292, -0.673, 1.702, 2.029, 0.555, -0.591, -1.655, 0.637, 0.064, -1.901, 0.064, -0.345, 0.637, 0.473, 
1.128, 1.702, -0.837, -1.328, -0.345, -0.918, -0.837, 1.128, -0.018, 0.637, 0.228, 1.128, 0.883, 0.064, 0.555, 0.146, 0.637, 0.637, 0.473, 0.883, -0.182, -0.345, 0.392, -0.837, -0.018, -1.41, 0.473, -1.819, 0.473, -0.427, -1.655, 0.31, 0.473, 0.555, 0.31, 0.473, 0.064, -0.1, -0.018, -1.082, 0.146, 0.146, -0.182, 0.392, -0.018, 2.193, 0.146, -0.509, 0.473, -0.918, 0.965, 1.047, -0.263, 0.473, -0.182, -0.182, 0.064, 0.228, -1.328, -1.655, -1.41, 1.128, 1.21, 0.637, 0.555, 0.31, 0.146, 1.783, -1.164, 1.538, -0.345, 0.31, 0.146, -0.509, 0.228, -0.837, -0.427, -0.591, -0.1, 0.146, -0.345, 1.292, -2.638, 0.637, 1.538, -2.965, -0.918, 1.128, 0.555, -1.655, -0.1, 0.228, 0.719, 1.047, -1, 0.473, -1.082, -1.328, 0.883, 0.719, 1.538, -1.082, 0.392)
fifaRaw <- c(fifaRaw, -0.918, -0.427, -0.755, 1.865, 0.31, -0.263, -3.539, -1.819, 0.228, 0.801, 0.473, 1.947, 0.801, -1.328, -0.345, 0.883, -0.182, 0.883, -0.1, -1, 1.456, -0.182, -0.427, 1.62, 0.064, 0.801, 1.374, 0.555, 0.473, 0.228, 0.637, -0.263, 0.637, 0.31, 1.047, -0.755, -0.345, -3.702, -0.182, 0.555, 1.21, -1.246, -0.263, -0.837, 0.31, 0.31, -0.018, -0.591, 0.392, -0.182, -2.228, -0.755, 0.555, 0.228, -0.345, 0.555, -0.182, -1.737, -1.328, -0.018, 1.128, -1.737, -2.147, -1.246, 0.228, 0.392, -0.509, -0.1, -0.182, -0.591, -0.509, 0.31, 0.228, 1.374, -1.164, 0.146, 1.538, 0.965, 1.374, -0.673, 1.128, 0.31, 0.555, 1.374, 0.392, 0.555, -0.018, -1.41, -0.755, 0.637, -1.164, 0.064, -1.082, 1.538, -1.082, 0.31, 0.555, -0.018, -2.147, 0.31, 0.883, 0.31, 1.292, -0.1, 0.31, 1.292, -2.31, 0.555, -0.182, -0.263, -0.1, 1.047, -0.018, 0.228, -0.263, 1.047, -0.918, -1.655, -0.1, 0.473, 0.883, -0.755, 0.392, -0.1, -1.41, 0.883, 0.965, -2.147, 0.146, -1.328, -0.673, -2.147, -1.41, -0.263, 0.146, 0.392, 0.637, 1.62, -0.182, -0.427, 1.21, 1.292, -1.901, 0.31, -0.427, -1, 0.965, 1.538, -0.1, -0.182, 1.783, -0.673, 1.128, 0.637, 0.801, 1.702, 0.146, 1.702, -2.802, 0.31, 2.111, 1.783, -0.755, 0.31, -1.573, -1.246, 0.965, 1.128, -0.591, -1.655, 1.292, -0.755, -2.802, -0.427, -0.755, 0.637, 1.947, 0.228, -0.345, 1.21, 0.228, -0.018, 1.62, 0.719, -1.082, -1.246, 0.228, 0.392, -0.673, -0.509, 0.31, -1, 0.637, 1.292, 0.883, -1.41, 0.883, 0.719, -1, 0.228, -1.983, 0.473, -0.263, -2.638, -0.427, -0.673, -0.263, 0.637, -2.556, 0.473, -0.1, 0.883, 0.473, -0.182, 1.128, -0.673, 0.228, 0.31, -0.591, 1.62, -1.328, 0.392, 0.064, -0.837, -1.655, -0.1, 0.064, -0.182, -1.082, 0.883, 0.637, -0.755, 0.801, 0.31, -1.082, 1.047, 1.374, 1.047, 0.719, 0.064, -0.755, -0.755, -0.918, 0.555, -1.41, -0.755, 0.392, 0.801, -0.755, -0.509, -1.246, 1.128, -0.427, -1.082, -1.492, -2.802, 0.31, 1.21, 1.21, -1.41, 0.555, -0.1, -1.082, 0.146, 0.719, -0.263, 0.473, 0.473, -1.246, -2.392, 1.292, -0.1, -0.673, 
0.801, 0.801, 0.392, 0.719, -2.883, 0.555, 0.31, -0.182, 0.883, 0.228, 0.555, 0.719, -0.837, 0.473, -0.755, 1.783, 0.637, -0.182, 0.473, 0.965, -1, -0.018, 0.719, 0.801, 1.374, -0.263, -1.655, 0.064, -0.018, -1.246, 1.047, 0.392, 1.456, 0.146, 0.555, 1.21, 0.555, -0.509, -0.345, 1.456, -1.082, 0.228, -0.427, 1.047, -0.673, -0.1, 0.555, 0.555, 1.047, 0.31, 0.555, -0.918, 0.392, -0.673, 0.883, -1.164, -0.591, 0.146, 0.392, -0.427, -0.591, -1.082, -0.1, 0.146, 0.801, -0.263, 0.801, -1.082, 0.392, -0.1, -0.345, -0.918, -2.72, 1.947, 0.473, -0.018, 0.883, 0.146, -0.018, -0.673, -0.018, 1.62, 1.456, -1.082, 0.228, -0.345, -0.755, 0.555, 0.555, -1.655, 1.128, -0.837, 0.965, 1.783, -0.182, -1.164, 0.232, -0.472, -0.522, -1.428, 0.232, -1.377, 0.836)
fifaRaw <- c(fifaRaw, -0.774, 0.283, 0.836, -0.22, -0.371, 0.484, 0.685, -0.874, -0.824, 0.937, 0.081, 0.937, 0.383, -1.88, 0.031, -1.981, -0.422, 0.534, -1.73, -0.623, 0.635, 0.635, 1.138, 0.383, 0.383, -0.12, 0.685, -1.88, -0.522, -1.428, 1.289, 0.987, 1.238, 1.289, 0.635, -1.528, 0.635, 1.037, -0.925, -1.025, 0.635, -1.78, -0.874, 1.138, -0.422, -0.371, -1.78, 0.534, -1.629, 1.037, 0.434, -1.78, -0.874, 0.283, 1.49, -0.723, 1.138, 0.434, -0.623, -0.573, 0.786, -0.371, 0.484, -1.227, -0.723, 0.031, 1.238, 1.037, -1.88, -0.17, 0.735, 1.037, 0.333, 1.238, 0.735, 0.735, -0.673, -1.478, -0.774, 0.031, 0.434, 0.484, 1.238, 1.289, 0.283, 0.635, 0.132, -0.573, -0.522, 0.434, 0.735, 1.188, -1.478, -1.176, 0.635, 0.584, -0.17, 0.584, 0.333, 0.081, 0.182, 1.087, 0.735, -1.629, 1.188, 1.037, -1.981, 0.735, 0.735, -0.12, -0.673, -1.579, -0.673, 0.685, 0.283, -0.975, 1.037, -1.277, 0.182, 1.138, 0.937, 1.44, -1.83, 0.232, 0.584, 0.735, -0.472, -0.623, 0.182, -1.377, -1.981, -0.371, 1.037, 1.037, 0.735, -0.019, 0.735, -0.774, 0.836, 0.534, -1.73, -0.422, -0.422, -0.925, -1.377, 0.987, 1.138, 1.44, 0.534, -1.176, 1.339, 0.735, 1.238, 0.685, -0.925, 0.132, 1.138, -1.528, 0.383, 0.031, -1.377, -1.78, -1.126, 0.836, -0.975, 0.333, -1.931, 0.685, 0.685, -0.673, 0.283, -0.774, 0.735, -1.327, -1.377, -0.623, 1.087, 0.232, -1.327, 0.685, 0.735, -0.774, 0.534, 1.138, -0.371, -2.082, -1.629, 0.383, 0.383, -0.12, 0.786, -0.22, -1.327, -1.327, -0.371, 0.534, 0.635, -0.925, 0.484, -1.73, 0.383, -0.271, 1.238, -1.377, 1.087, 1.087, 0.685, -1.126, -0.371, 0.685, 0.987, 0.584, 0.333, 0.987, 0.232, -0.12, 0.886, 1.339, 0.886, 0.182, -0.925, -0.975, -2.132, 0.836, 1.087, -0.573, 1.641, -1.73, 0.081, 0.434, -1.679, 1.087, -0.874, -1.83, 0.031, -0.774, 0.987, 1.037, 0.383, 0.786, -1.126, 0.182, -0.522, -1.377, 1.138, 0.836, 0.937, 1.138, -0.321, 1.289, 0.232, -1.931, 0.786, -0.17, 0.031, -1.78, -0.975, 0.937, 0.635, 0.484, 0.534, 1.641, 0.635, -1.126, 2.043, -0.522, -1.629, 0.232, 0.584, -1.428, 
-0.673, 0.283, -1.277, 0.685, 1.842, -1.528, -1.227, -1.277, 0.534, 1.289, 0.886, 0.584, -1.981, 0.735, 1.389, 0.987, -0.12, -1.227, -1.83, -0.472, 1.238, 1.037, -0.623, -1.428, 0.383, 0.534, -2.082, 0.383, 0.836, 0.534, -0.774, 0.685, 0.886, -0.925, 0.937, 0.836, 0.434, 1.037, -0.573, -0.925, 0.836, 0.635, -1.428, 0.584, -1.277, -1.78, -0.12, 1.339, -0.774, -0.371, 1.389, -1.126, 0.584, -0.975, -1.679, 0.685, -0.925, -1.377, 0.333, 0.584, 0.383, 0.031, -1.78, 0.937, 1.238, -0.12, -0.673, 1.087, -1.428, -0.07, 0.886, 1.289, -0.07, 1.238, 0.283, 0.836, 0.031, -1.88, -0.22, -0.07, 0.987, -1.88, -1.025, 0.534, 1.238, 0.333, 0.886, 0.836, 1.087, -0.019, 1.641, 0.937, -1.629, -0.07, -1.78, 0.383, -1.88, -0.12, 0.132, 0.333, 1.238, -0.824, -0.371)
fifaRaw <- c(fifaRaw, -0.422, 0.735, 0.937, -0.522, 0.886, 0.383, -1.629, 0.685, 1.44, -1.327, 0.383, 1.087, 0.735, -1.83, 1.49, 0.786, 0.937, 1.389, 0.886, 0.333, -1.428, 0.534, -0.371, 0.534, 0.635, 0.031, 0.484, 1.238, -1.377, 0.735, 1.087, -0.12, 0.786, -0.874, 0.987, 1.238, -0.975, 0.836, -0.12, 1.641, 0.635, 0.635, 0.383, -0.623, -1.579, 0.635, 0.132, 1.037, -0.774, -0.07, -1.931, 0.383, -1.83, -1.176, 0.886, -1.327, 1.087, 0.635, 1.339, 1.339, 0.886, 0.836, 0.534, 1.238, 0.484, 0.685, -1.327, 0.584, -1.629, 0.987, -0.422, 0.786, -0.321, -1.277, -1.83, -1.78, 0.383, 0.232, -0.472, -0.522, -1.277, -0.321, 1.188, -0.422, 0.635, 0.635, -0.371, 0.081, -1.478, -1.478, 1.289, -1.025, 0.836, 0.081, -1.83, -1.78, -1.428, 1.993, -0.874, 0.484, -0.019, 0.735, 0.635, -1.428, -0.17, -1.227, -1.126, 0.534, 0.735, 0.534, 0.081, -0.573, -0.22, -1.679, 1.138, 0.534, 1.188, 0.132, 0.434, 0.937, -1.551, -0.023, -1.506, -1.281, 0.382, -1.641, 1.011, -0.697, 0.292, 0.337, -0.292, -0.068, -0.337, 0.786, -0.517, -1.641, 0.921, 0.292, 0.427, 0.337, -1.371, 0.696, -1.461, 0.247, 0.651, -1.551, -0.292, 0.966, 0.831, 1.011, 0.606, 0.696, 0.786, 0.831, -1.551, -0.922, -1.012, 1.281, 0.966, 1.146, 1.236, 0.741, -1.416, 0.292, 0.921, -1.057, -0.832, 0.786, -1.326, -0.248, 1.325, -0.742, -0.292, -1.461, 0.606, -1.596, 1.056, 0.786, -1.686, -1.191, 0.516, 1.325, -1.551, 1.056, 0.292, 0.786, 0.067, 1.281, 0.067, 0.831, -0.967, -0.292, -0.787, 1.415, 0.921, -1.641, -0.337, 0.337, 0.921, -0.967, 0.921, 0.831, 0.831, 0.247, -1.371, -0.697, 0.472, 0.561, 0.516, 1.191, 1.236, 0.651, 0.561, 0.921, 0.067, -0.967, 0.831, -1.236, 0.876, -0.877, -1.461, 0.651, 0.786, 0.472, 0.831, 0.741, 0.561, -0.113, 0.966, 0.696, -1.731, 1.146, 1.325, -1.551, 0.876, 1.056, 0.247, -0.742, -1.101, -1.506, 0.741, 0.786, -1.506, 0.741, -1.641, 0.472, 1.146, 0.651, 1.191, -1.461, 0.696, 0.247, 0.741, -1.326, -0.877, 0.786, -0.922, -1.371, 0.382, 0.921, 1.101, 1.101, 0.561, 0.966, -0.877, 1.236, 0.921, -1.461, -0.472, 
-0.337, -1.506, -0.517, 1.011, 1.011, 1.236, -1.236, -1.101, 1.146, 0.696, 1.281, 0.696, -1.146, 0.831, 0.876, -1.596, -0.742, -1.012, -1.416, -1.596, -1.506, 0.696, -0.517, 0.876, -1.686, 0.741, 0.292, -1.236, 0.292, -0.607, 0.696, -0.877, -1.641, -0.697, 1.101, -0.472, -1.191, 0.876, 0.696, -0.517, 0.292, 1.011, 0.202, -1.596, -1.461, 0.741, 0.921, -1.101, 0.741, 1.011, -1.461, -1.461, -0.517, 0.516, 0.831, -0.697, 0.427, -1.596, 1.011, -0.832, 0.786, -1.146, 1.101, 1.146, 0.606, -1.057, 0.112, 0.876, 0.876, 0.472, 0.516, 0.966, 0.382, 0.292, 0.786, 1.325, 0.921, 0.022, -1.326, -1.281, -1.551, 0.561, 1.056, -1.461, 1.37, -1.236, -0.248, -0.158, -1.506, 1.146, -0.158, -1.506, 0.382, 0.112, 0.651, 1.011, -0.517, 0.921, -1.641, 0.696)
fifaRaw <- c(fifaRaw, -0.742, -1.506, 1.101, 1.011, 0.921, 0.966, -0.203, 1.101, 0.247, -1.596, 0.427, -1.146, 0.696, -1.551, 0.022, 0.696, 0.786, 0.561, 0.651, 1.46, 0.696, -1.461, 1.73, -1.146, -1.506, 0.516, 0.786, -1.461, -1.326, -0.382, -1.641, 0.561, 0.876, -1.641, -0.967, -0.472, 0.157, 1.46, 0.966, -0.832, -1.641, 1.056, 1.191, 1.191, -1.012, -1.551, -1.506, -0.922, 1.281, 1.325, -0.967, -1.461, 0.651, 0.472, -1.641, 0.651, 0.921, -0.697, -0.517, 1.011, 0.831, -1.326, 0.921, 0.696, 0.651, 1.011, -0.877, -1.686, 0.966, 0.786, -1.057, 0.606, -1.191, -1.641, 0.516, 1.281, -0.158, -0.292, 1.46, -1.146, 0.741, -1.416, -1.416, -0.967, -0.832, -1.551, 0.831, 0.606, 0.786, 0.561, -1.641, 0.831, 1.056, 0.292, -0.877, 1.191, -1.191, 0.472, 0.786, 1.101, 0.561, 1.056, 0.382, 0.561, -0.472, -1.506, -0.158, 0.337, 0.786, -1.641, -1.371, 0.831, 1.146, 0.561, 1.146, 0.472, 0.382, -1.281, 1.64, 0.741, -1.236, -0.292, -1.416, -0.967, -1.596, -0.068, 0.247, 0.247, 1.011, -0.697, -0.742, -1.146, 0.651, 0.786, -0.248, 0.606, 0.472, -1.551, 0.516, 1.37, -0.787, 0.202, 1.281, 0.741, -1.596, 1.37, 1.146, 0.606, 1.415, 1.236, 0.022, -1.416, -0.697, 0.022, 0.651, 0.831, 0.337, 0.247, 1.325, -1.506, 0.651, 0.876, 0.651, 0.651, -1.551, 1.056, 1.191, -1.012, 0.651, -1.326, 1.37, 0.831, 0.651, -0.607, -0.652, -1.461, 0.472, -0.113, 0.741, -0.248, -0.158, -1.506, 0.651, -1.686, -1.236, 0.876, -1.101, 1.191, 0.741, 1.146, 1.011, 0.786, 0.831, 0.651, 1.46, 0.606, 0.606, -1.281, 0.921, -1.506, 0.831, -1.506, 0.966, -0.742, -1.146, -1.461, -1.326, 0.292, 0.651, -0.562, -0.292, -1.461, -0.652, 0.651, -1.101, 0.876, -0.248, -0.113, 0.112, -1.596, -1.012, 1.236, -0.922, 0.831, 0.516, -1.012, -1.461, -1.506, 2, -0.248, 0.606, -0.697, 0.651, 0.561, -1.236, -0.203, -1.146, -1.057, 0.921, 1.056, 0.921, 0.337, -0.562, -1.371, -1.686, 1.37, -0.248, 1.011, -0.113, 0.696, 0.516, -1.516, 0.053, -1.331, -1.377, 0.837, -1.562, 0.976, 0.099, 0.376, 0.56, 0.145, -0.685, -0.731, 0.699, -0.593, -1.608, 
0.929, -0.316, 0.237, 0.237, -1.47, 0.791, -1.377, 0.053, 0.837, -1.423, 0.791, 1.022, 0.653, 0.791, 0.468, 0.745, 0.883, 0.699, -1.285, -0.962, -1.1, 1.206, 0.883, 0.976, 1.252, 0.422, -1.562, 0.883, 1.114, -1.1, -1.1, 0.653, -1.562, -0.778, 1.483, -0.778, 0.007, -1.562, 0.33, -1.377, 1.252, 0.745, -1.654, -1.562, 0.653, 1.621, -1.331, 1.16, -0.039, 0.33, -0.178, 0.976, -0.501, 0.745, -0.962, -0.501, -1.054, 1.068, 0.791, -1.562, -0.039, 0.468, 1.022, -1.562, 0.883, 0.837, 1.022, 0.468, -1.47, -0.731, 0.237, 0.33, 0.468, 1.206, 1.252, 0.653, 0.837, 0.929, 0.191, -1.193, 0.883, -1.147, 0.929, -0.916, -1.285, 0.653, 0.791, 0.468, 0.976, 0.468, 0.56, -0.178, 0.929, 0.284, -1.516, 1.022, 1.298, -1.47, 0.699, 1.252, 0.007, -1.147, -1.285)
fifaRaw <- c(fifaRaw, -1.423, 0.745, 0.653, -1.516, 0.791, -1.562, 0.284, 1.206, 0.422, 1.16, -1.239, 0.606, 0.284, 0.929, -1.516, -1.008, 0.699, -0.778, -1.377, 0.376, 0.929, 1.114, 0.791, 0.376, 1.022, -1.008, 1.252, 0.976, -1.608, -0.455, -0.316, -1.147, -1.147, 1.022, 0.976, 0.837, -1.054, -0.962, 0.883, 0.653, 1.298, 0.745, -1.193, 0.883, 0.929, -1.562, -0.731, -0.87, -1.47, -1.47, -1.562, 0.653, -0.547, 0.791, -1.654, 0.791, 0.284, -1.285, 0.284, -0.824, 0.745, -0.316, -1.47, -0.593, 1.114, -0.501, -1.47, 1.068, 0.745, -0.224, 0.284, 1.068, 0.237, -1.47, -1.331, 0.791, 0.837, -1.1, 0.883, 0.33, -1.516, -1.423, -0.27, 0.237, 0.791, -0.87, 0.468, -1.516, 0.883, -0.824, 0.791, -1.331, 1.068, 1.391, 0.699, -0.916, 0.099, 1.022, 0.929, 0.653, 0.56, 0.929, 0.56, -0.132, 0.745, 1.345, 0.976, -0.362, -1.377, -1.193, -1.47, 0.699, 1.16, -1.193, 1.298, -1.516, -0.039, -0.178, -1.562, 1.252, -0.27, -1.47, 0.237, 0.33, 0.422, 0.837, 0.145, 1.114, -1.516, 0.653, -0.731, -1.562, 1.068, 1.068, 0.976, 0.837, -0.962, 1.114, -0.316, -1.562, 0.33, -0.962, 0.514, -1.47, -0.132, 0.468, 0.791, 0.376, 0.422, 1.391, 0.745, -1.331, 1.898, -1.193, -1.654, 0.422, 0.745, -1.377, -1.423, -0.639, -1.608, 0.653, 0.699, -1.562, -1.054, -0.87, 0.007, 1.483, 0.883, -1.1, -1.47, 0.653, 0.929, 1.206, -0.962, -1.331, -1.654, -1.331, 1.252, 1.206, -0.87, -1.239, -0.178, 0.376, -1.654, 0.745, 0.883, -0.916, -0.316, 0.929, 0.929, -1.608, 0.837, 0.745, 0.929, 0.976, -0.778, -1.147, 1.16, 0.653, -1.1, 0.653, -1.377, -1.285, 0.653, 1.16, -0.501, 0.053, 1.714, -0.87, 0.56, -1.239, -1.423, -1.054, -0.639, -1.608, 0.791, 0.976, 0.883, 0.468, -1.516, 0.883, 1.252, 0.237, -0.408, 0.699, -1.377, 0.237, 0.929, 1.114, 0.376, 1.252, 0.237, 0.56, -0.824, -1.423, -0.178, 0.376, 0.929, -1.608, -1.285, 0.883, 1.16, 0.468, 1.114, 0.468, 0.653, -0.87, 1.714, 0.606, -1.377, -0.316, -1.193, -0.87, -1.516, -0.178, 0.376, 0.237, 1.206, -0.685, -0.778, -1.147, 0.883, 0.791, 0.053, 0.699, 0.514, -1.193, 0.376, 1.298, 
-0.962, 0.284, 1.114, 0.791, -1.654, 1.391, 1.114, 1.391, 1.76, 0.606, 0.33, -1.47, -0.824, 0.56, 0.791, 0.929, 0.33, 0.237, 1.345, -1.423, 0.606, 1.022, 0.606, 0.883, -1.285, 0.883, 1.252, -1.193, 0.883, -1.193, 1.483, 0.883, 0.699, 0.284, -0.639, -1.377, 0.33, -0.455, 0.929, -0.547, 0.099, -1.654, 0.699, -1.562, -1.377, 0.883, -1.193, 1.114, 0.837, 1.298, 1.022, 0.929, 0.745, 0.653, 0.883, 0.56, 0.653, -0.962, 0.837, -1.562, 0.837, -1.285, 0.976, -1.147, -1.423, -1.239, -1.331, 0.791, 0.883, -0.547, 0.007, -1.423, -0.962, 0.699, -1.377, 0.837, 0.422, -0.962, 0.099, -1.47, -1.008, 1.298, -1.054, 0.791, 0.699, -0.962, -1.608, -1.193, 2.083, -0.132, 0.56, 0.053, 0.699, 0.606, -1.008, -0.039, -1.239, -1.1, 0.791, 0.699, 1.022, 0.191, -0.593, -1.516, -1.423)
fifaRaw <- c(fifaRaw, 1.391, 0.053, 1.022, 0.284, 0.56, 0.745, -0.156, -0.523, -0.418, 2.156, -0.261, 3.364, -0.628, -0.576, -0.156, -0.523, -0.628, -0.471, -0.366, -0.261, -0.313, -0.576, -0.156, -0.576, -0.471, -0.208, 1.998, -0.366, 2.366, -0.418, -0.366, 2.313, -0.208, -0.628, -0.471, -0.313, -0.471, -0.523, -0.261, -0.156, 3.206, -0.471, -0.366, -0.366, -0.208, -0.471, -0.313, -0.261, 2.523, -0.313, -0.471, -0.313, -0.576, -0.523, 2.628, -0.681, -0.156, -0.523, -0.366, 2.996, -0.418, 2.838, -0.261, -0.523, 1.893, -0.471, -0.366, -0.156, -0.418, -0.313, -0.208, -0.681, -0.628, -0.313, -0.366, -0.156, -0.628, -0.418, -0.208, -0.418, -0.156, 2.733, -0.628, -0.156, -0.208, -0.156, -0.418, -0.576, -0.576, -0.313, 3.101, -0.103, -0.103, -0.261, -0.261, -0.366, -0.576, -0.261, -0.261, -0.366, -0.156, -0.261, -0.418, -0.418, -0.261, -0.261, -0.418, -0.628, -0.208, -0.208, -0.208, -0.156, -0.576, -0.261, -0.366, -0.576, 2.051, -0.156, -0.576, 1.946, -0.261, -0.418, -0.523, -0.261, -0.523, -0.156, -0.208, -0.471, 2.628, -0.471, 2.523, -0.523, -0.103, -0.576, -0.156, 2.313, -0.523, -0.681, -0.523, -0.471, -0.261, -0.103, -0.208, 2.103, -0.366, -0.208, -0.628, -0.471, -0.313, -0.103, -0.261, -0.261, -0.628, 3.101, -0.471, -0.366, -0.313, -0.576, -0.156, -0.313, -0.576, -0.156, -0.261, -0.103, -0.523, -0.208, -0.576, -0.628, -0.523, -0.366, 2.681, -0.471, -0.576, 2.733, 2.628, 2.523, -0.576, -0.366, -0.261, 2.418, -0.418, -0.418, -0.628, -0.523, -0.418, -0.156, -0.418, 1.42, -0.628, -0.366, -0.156, -0.208, -0.628, -0.103, -0.261, -0.156, -0.576, -0.208, 1.525, 2.366, -0.628, -0.418, -0.576, -0.208, -0.261, 2.576, 2.733, -0.628, -0.628, -0.208, -0.471, -0.681, -0.576, -0.366, -0.103, -0.523, -0.208, -0.628, -0.576, -0.261, -0.208, -0.523, -0.261, -0.313, -0.366, -0.313, -0.471, -0.208, -0.576, -0.576, -0.471, -0.418, -0.576, -0.313, -0.261, 2.261, -0.156, -0.628, -0.418, -0.313, 2.733, -0.576, -0.576, 2.313, -0.471, -0.576, 2.576, -0.418, -0.208, -0.576, -0.103, -0.208, 
-0.261, -0.208, -0.366, -0.156, -0.523, -0.208, -0.261, -0.208, -0.576, -0.261, -0.628, -0.208, 1.735, -0.471, -0.208, -0.628, 2.418, -0.523, -0.471, -0.418, -0.418, -0.103, -0.523, -0.418, -0.523, -0.471, -0.156, 3.154, -0.208, -0.576, -0.261, -0.418, -0.208, 2.103, -0.418, -0.471, 2.523, -0.576, -0.261, -0.523, -0.523, -0.523, -0.103, 2.313, -0.313, -0.523, -0.681, -0.523, 2.208, 2.366, -0.628, -0.576, -0.471, -0.366, 1.893, -0.261, -0.628, 2.523, -0.208, -0.156, -0.208, -0.366, -0.523, -0.628, -0.313, -0.208, -0.523, -0.208, -0.471, -0.576, -0.313, -0.261, -0.576, -0.313, -0.523, -0.156, 3.049, -0.261, -0.261, -0.576, -0.418, -0.313, -0.523, -0.471, -0.418, 2.628, -0.628, -0.471, 2.681, -0.523, -0.628, -0.418, -0.208, 2.156, -0.366, -0.261, -0.261, -0.471, -0.471, -0.313, -0.523, -0.471, -0.208, -0.208, -0.576, -0.156, -0.418, -0.523)
fifaRaw <- c(fifaRaw, 2.261, -0.366, -0.261, -0.103, 2.576, -0.103, -0.628, -0.418, -0.208, -0.208, -0.418, -0.471, -0.523, -0.261, -0.471, -0.418, -0.681, 2.786, -0.523, 2.471, -0.366, -0.628, -0.208, -0.471, -0.366, -0.261, -0.156, -0.208, -0.366, -0.681, -0.208, -0.471, 2.838, -0.523, -0.523, -0.208, -0.156, -0.366, -0.103, 2.156, -0.208, -0.208, -0.366, -0.156, -0.628, -0.523, 1.998, -0.576, -0.681, -0.418, -0.366, -0.418, -0.103, -0.103, 2.944, -0.576, -0.471, -0.366, -0.208, -0.366, -0.628, -0.681, -0.208, -0.628, -0.523, -0.208, -0.471, -0.628, -0.366, -0.418, 1.998, -0.523, -0.628, -0.681, -0.681, -0.261, 2.051, -0.208, 2.628, -0.418, -0.681, -0.471, -0.156, -0.208, -0.103, -0.418, -0.576, -0.261, -0.576, -0.576, -0.366, -0.523, -0.366, -0.156, 2.523, -0.471, -0.313, -0.313, -0.313, 3.101, 2.733, -0.208, -0.156, -0.523, -0.156, -0.471, -0.156, -0.366, -0.471, -0.576, -0.576, -0.628, -0.208, -0.576, 2.838, -0.681, -0.313, -0.366, -0.103, -0.208, 2.261, 2.681, 2.733, -0.366, -0.628, -0.261, -0.313, -0.313, -0.313, -0.471, -0.103, -0.313, 0.002, -0.313, -0.471, -0.156, -0.418, -0.156, -0.313, 2.681, -0.156, -0.523, -0.576, -0.418, -0.313, -0.523, -0.485, -0.323, -0.216, 1.987, -0.592, 3.115, -0.216, -0.538, -0.108, -0.699, -0.377, -0.538, -0.108, -0.485, -0.162, -0.216, -0.485, -0.323, -0.592, -0.162, 2.095, -0.538, 2.363, -0.431, -0.377, 2.256, -0.216, -0.377, -0.377, -0.27, -0.27, -0.646, -0.162, -0.162, 3.008, -0.485, -0.431, -0.162, -0.538, -0.485, -0.699, -0.323, 2.632, -0.699, -0.323, -0.592, -0.538, -0.216, 2.686, -0.699, -0.323, -0.592, -0.538, 2.739, -0.27, 2.793, -0.323, -0.699, 2.095, -0.162, -0.485, -0.377, -0.323, -0.377, -0.592, -0.323, -0.485, -0.162, -0.485, -0.592, -0.646, -0.377, -0.323, -0.377, -0.27, 2.578, -0.431, -0.162, -0.108, -0.485, -0.108, -0.592, -0.485, -0.538, 3.008, -0.162, -0.216, -0.431, -0.646, -0.108, -0.485, -0.431, -0.27, -0.431, -0.377, -0.216, -0.646, -0.162, -0.485, -0.431, -0.538, -0.431, -0.646, -0.485, -0.538, -0.377, 
-0.162, -0.162, -0.216, -0.162, 1.665, -0.216, -0.27, 1.826, -0.485, -0.216, -0.538, -0.538, -0.323, -0.162, -0.592, -0.108, 2.9, -0.216, 2.739, -0.323, -0.162, -0.216, -0.27, 2.471, -0.485, -0.162, -0.592, -0.592, -0.323, -0.485, -0.216, 1.88, -0.323, -0.216, -0.377, -0.323, -0.431, -0.162, -0.431, -0.216, -0.216, 2.954, -0.538, -0.216, -0.216, -0.377, -0.377, -0.27, -0.377, -0.27, -0.377, -0.216, -0.699, -0.646, -0.216, -0.485, -0.162, -0.431, 2.686, -0.431, -0.485, 2.847, 2.632, 2.847, -0.162, -0.323, -0.699, 2.202, -0.162, -0.592, -0.323, -0.377, -0.162, -0.216, -0.431, 2.202, -0.592, -0.27, -0.377, -0.216, -0.431, -0.27, -0.377, -0.162, -0.646, -0.27, 1.933, 2.578, -0.485, -0.323, -0.27, -0.216, -0.216, 2.471)
fifaRaw <- c(fifaRaw, 2.847, -0.646, -0.377, -0.377, -0.538, -0.377, -0.592, -0.216, -0.431, -0.27, -0.646, -0.538, -0.27, -0.108, -0.538, -0.27, -0.323, -0.538, -0.323, -0.485, -0.431, -0.592, -0.538, -0.485, -0.108, -0.216, -0.485, -0.592, -0.27, 2.256, -0.377, -0.323, -0.377, -0.323, 2.9, -0.538, -0.592, 2.202, -0.27, -0.538, 2.632, -0.27, -0.538, -0.27, -0.485, -0.377, -0.431, -0.27, -0.538, -0.216, -0.592, -0.162, -0.27, -0.538, -0.108, -0.216, -0.162, -0.162, 1.718, -0.162, -0.27, -0.162, 2.417, -0.431, -0.485, -0.485, -0.216, -0.323, -0.27, -0.592, -0.323, -0.377, -0.377, 2.632, -0.538, -0.646, -0.485, -0.485, -0.485, 2.686, -0.162, -0.108, 2.686, -0.646, -0.323, -0.377, -0.485, -0.592, -0.323, 2.309, -0.323, -0.323, -0.592, -0.431, 2.202, 2.202, -0.485, -0.377, -0.377, -0.216, 2.363, -0.377, -0.485, 2.148, -0.485, -0.323, -0.592, -0.162, -0.27, -0.485, -0.377, -0.646, -0.538, -0.377, -0.646, -0.216, -0.646, -0.592, -0.538, -0.323, -0.108, -0.431, 2.847, -0.592, -0.162, -0.485, -0.27, -0.108, -0.377, -0.431, -0.377, 3.115, -0.592, -0.323, 2.686, -0.646, -0.162, -0.538, -0.323, 1.826, -0.216, -0.592, -0.323, -0.431, -0.485, -0.162, -0.431, -0.108, -0.485, -0.216, -0.27, -0.538, -0.162, -0.377, 2.417, -0.592, -0.431, -0.27, 2.524, -0.538, -0.216, -0.27, -0.699, -0.646, -0.323, -0.431, -0.323, -0.377, -0.162, -0.216, -0.323, 2.524, -0.431, 2.578, -0.592, -0.431, -0.216, -0.108, -0.162, -0.485, -0.592, -0.216, -0.431, -0.377, -0.592, -0.377, 2.847, -0.592, -0.592, -0.538, -0.646, -0.323, -0.431, 2.471, -0.27, -0.377, -0.699, -0.323, -0.538, -0.592, 1.826, -0.699, -0.323, -0.592, -0.538, -0.485, -0.377, -0.323, 2.632, -0.592, -0.485, -0.646, -0.377, -0.485, -0.377, -0.538, -0.162, -0.646, -0.538, -0.27, -0.538, -0.592, -0.108, -0.216, 2.041, -0.162, -0.27, -0.377, -0.538, -0.108, 2.095, -0.592, 2.739, -0.646, -0.162, -0.323, -0.431, -0.27, -0.108, -0.108, -0.323, -0.377, -0.431, -0.377, -0.538, -0.538, -0.646, -0.592, 2.739, -0.108, -0.216, -0.108, -0.323, 3.115, 
2.739, -0.377, -0.323, -0.216, -0.431, -0.431, -0.162, -0.646, -0.162, -0.646, -0.431, -0.431, -0.699, -0.431, 2.686, -0.162, -0.162, -0.646, -0.162, -0.108, 2.524, 2.9, 1.826, -0.538, -0.216, -0.485, -0.592, -0.162, -0.485, -0.377, -0.162, -0.592, -0.108, -0.431, -0.216, -0.699, -0.485, -0.27, -0.485, 2.471, -0.431, -0.485, -0.323, -0.538, -0.377, -0.162, -0.525, -0.25, -0.25, 1.999, -0.36, 2.328, -0.525, -0.47, -0.47, -0.25, -0.47, -0.305, -0.36, -0.579, -0.195, -0.579, -0.14, -0.14, -0.579, -0.305, 1.999, -0.525, 2.493, -0.305, -0.634, 2.219, -0.525, -0.525, -0.086, -0.525, -0.305, -0.47, -0.415, -0.525, 2.822, -0.47, -0.415, -0.14, -0.086, -0.36, -0.36, -0.415, 2.658, -0.634, -0.634, -0.525, -0.305, -0.086, 2.658, -0.579, -0.195, -0.086, -0.47)
fifaRaw <- c(fifaRaw, 3.371, -0.579, 3.042, -0.086, -0.36, 2.054, -0.36, -0.525, -0.25, -0.25, -0.634, -0.579, -0.634, -0.305, -0.14, -0.47, -0.415, -0.525, -0.415, -0.634, -0.25, -0.525, 3.316, -0.36, -0.25, -0.25, -0.525, -0.195, -0.47, -0.579, -0.579, 2.767, -0.47, -0.25, -0.525, -0.47, -0.525, -0.579, -0.634, -0.525, -0.086, -0.14, -0.579, -0.47, -0.14, -0.36, -0.579, -0.579, -0.525, -0.579, -0.195, -0.305, -0.305, -0.195, -0.415, -0.525, -0.47, 2.548, -0.689, -0.36, 1.999, -0.525, -0.579, -0.634, -0.47, -0.47, -0.47, -0.579, -0.14, 2.383, -0.579, 2.658, -0.305, -0.25, -0.25, -0.14, 2.822, -0.305, -0.14, -0.47, -0.579, -0.36, -0.25, -0.195, 1.89, -0.525, -0.47, -0.47, -0.25, -0.47, -0.305, -0.305, -0.195, -0.415, 3.206, -0.086, -0.634, -0.25, -0.525, -0.086, -0.579, -0.634, -0.47, -0.415, -0.25, -0.634, -0.14, -0.36, -0.195, -0.525, -0.305, 3.206, -0.305, -0.305, 2.822, 2.328, 2.767, -0.086, -0.25, -0.195, 2.274, -0.525, -0.195, -0.525, -0.579, -0.415, -0.25, -0.195, 1.56, -0.634, -0.415, -0.36, -0.525, -0.14, -0.415, -0.47, -0.195, -0.579, -0.305, 2.219, 2.548, -0.579, -0.195, -0.305, -0.415, -0.47, 2.822, 3.097, -0.195, -0.195, -0.305, -0.086, -0.525, -0.579, -0.47, -0.634, -0.36, -0.689, -0.525, -0.634, -0.36, -0.525, -0.525, -0.47, -0.305, -0.525, -0.305, -0.415, -0.415, -0.195, -0.086, -0.525, -0.305, -0.525, -0.305, -0.525, 1.89, -0.415, -0.47, -0.195, -0.36, 2.658, -0.415, -0.579, 2.219, -0.47, -0.195, 2.274, -0.36, -0.195, -0.086, -0.36, -0.195, -0.195, -0.14, -0.25, -0.579, -0.195, -0.525, -0.36, -0.47, -0.634, -0.47, -0.415, -0.36, 1.944, -0.086, -0.634, -0.25, 2.328, -0.525, -0.305, -0.47, -0.25, -0.525, -0.47, -0.305, -0.36, -0.579, -0.579, 2.767, -0.305, -0.195, -0.525, -0.579, -0.634, 2.438, -0.25, -0.634, 2.438, -0.36, -0.415, -0.195, -0.689, -0.14, -0.25, 2.493, -0.25, -0.305, -0.195, -0.25, 2.493, 2.219, -0.36, -0.47, -0.195, -0.25, 2.658, -0.25, -0.25, 1.725, -0.305, -0.14, -0.525, -0.47, -0.36, -0.36, -0.634, -0.525, -0.689, -0.195, -0.579, 
-0.36, -0.525, -0.305, -0.415, -0.579, -0.579, -0.195, 2.603, -0.25, -0.579, -0.415, -0.579, -0.36, -0.086, -0.525, -0.086, 2.164, -0.36, -0.195, 2.603, -0.579, -0.305, -0.36, -0.47, 1.999, -0.14, -0.36, -0.579, -0.47, -0.195, -0.579, -0.689, -0.47, -0.14, -0.634, -0.36, -0.634, -0.579, -0.525, 2.548, -0.36, -0.634, -0.305, 2.713, -0.579, -0.305, -0.305, -0.634, -0.195, -0.25, -0.36, -0.36, -0.47, -0.579, -0.36, -0.25, 2.658, -0.525, 2.713, -0.195, -0.25, -0.525, -0.195, -0.305, -0.195, -0.25, -0.36, -0.14, -0.579, -0.579, -0.634, 2.658, -0.47, -0.415, -0.14, -0.634, -0.195, -0.14, 2.493, -0.415, -0.47, -0.525, -0.25, -0.47, -0.634, 1.725, -0.415, -0.305, -0.415, -0.634, -0.634, -0.25, -0.14, 2.328, -0.195, -0.47, -0.086, -0.14, -0.47, -0.195)
fifaRaw <- c(fifaRaw, -0.689, -0.634, -0.086, -0.305, -0.195, -0.36, -0.305, -0.525, -0.579, 2.219, -0.25, -0.47, -0.579, -0.634, -0.086, 2.109, -0.195, 1.56, -0.36, -0.195, -0.47, -0.195, -0.14, -0.195, -0.086, -0.47, -0.36, -0.525, -0.14, -0.25, -0.36, -0.579, -0.579, 2.603, -0.47, -0.47, -0.579, -0.525, 2.767, 3.261, -0.36, -0.525, -0.305, -0.305, -0.47, -0.195, -0.25, -0.634, -0.47, -0.305, -0.305, -0.525, -0.305, 2.438, -0.634, -0.305, -0.579, -0.086, 0.683, 2.438, 2.603, 2.713, -0.47, -0.525, -0.25, -0.195, -0.634, -0.634, -0.305, -0.195, -0.47, -0.25, -0.415, -0.14, -0.305, -0.579, -0.25, -0.36, 2.713, -0.14, -0.305, -0.195, -0.47, -0.305, -0.25, -0.567, -0.356, -0.144, 2.074, -0.408, 3.078, -0.144, -0.62, -0.197, -0.408, -0.461, -0.356, -0.514, -0.408, -0.514, -0.461, -0.197, -0.461, -0.197, -0.144, 2.022, -0.197, 2.391, -0.303, -0.514, 2.339, -0.567, -0.461, -0.62, -0.408, -0.197, -0.25, -0.144, -0.25, 3.237, -0.303, -0.567, -0.303, -0.408, -0.514, -0.197, -0.25, 2.603, -0.25, -0.356, -0.408, -0.567, -0.197, 2.708, -0.62, -0.144, -0.514, -0.356, 2.761, -0.356, 2.708, -0.567, -0.514, 2.497, -0.567, -0.091, -0.356, -0.567, -0.144, -0.197, -0.144, -0.197, -0.408, -0.62, -0.303, -0.197, -0.144, -0.567, -0.567, -0.514, 2.708, -0.408, -0.356, -0.62, -0.356, -0.567, -0.567, -0.356, -0.408, 2.92, -0.091, -0.197, -0.356, -0.408, -0.461, -0.25, -0.514, -0.303, -0.144, -0.144, -0.408, -0.673, -0.25, -0.303, -0.303, -0.408, -0.567, -0.25, -0.25, -0.197, -0.197, -0.303, -0.514, -0.514, -0.197, 2.286, -0.25, -0.62, 1.81, -0.567, -0.514, -0.356, -0.303, -0.25, -0.25, -0.091, -0.356, 2.603, -0.567, 2.814, -0.25, -0.567, -0.356, -0.514, 2.444, -0.197, -0.303, -0.567, -0.514, -0.567, -0.25, -0.356, 2.603, -0.356, -0.303, -0.408, -0.197, -0.673, -0.25, -0.62, -0.303, -0.25, 3.025, -0.303, -0.62, -0.461, -0.461, -0.514, -0.567, -0.514, -0.461, -0.514, -0.356, -0.514, -0.514, -0.144, -0.567, -0.303, -0.144, 2.497, -0.303, -0.461, 2.603, 2.656, 2.814, -0.091, -0.303, -0.567, 
2.074, -0.514, -0.091, -0.461, -0.356, -0.197, -0.303, -0.567, 1.388, -0.673, -0.144, -0.461, -0.567, -0.25, -0.567, -0.461, -0.144, -0.567, -0.197, 1.757, 2.444, -0.303, -0.461, -0.356, -0.197, -0.62, 2.391, 2.867, -0.25, -0.25, -0.144, -0.62, -0.62, -0.25, -0.197, -0.197, -0.514, -0.303, -0.673, -0.461, -0.197, -0.408, -0.514, -0.673, -0.461, -0.303, -0.408, -0.408, -0.408, -0.567, -0.303, -0.514, -0.197, -0.144, -0.197, -0.62, 2.127, -0.461, -0.567, -0.303, -0.144, 2.814, -0.461, -0.356, 2.127, -0.62, -0.408, 2.286, -0.461, -0.62, -0.461, -0.408, -0.144, -0.408, -0.303, -0.25, -0.25, -0.197, -0.197, -0.303, -0.144, -0.408, -0.673, -0.461, -0.197, 1.599, -0.461, -0.62, -0.303, 2.55, -0.197, -0.514, -0.303, -0.514, -0.62, -0.62, -0.62, -0.673)
# Continue building up the standardized (z-scored) FIFA 19 feature values in
# fifaRaw. The data is appended in chunks purely to keep individual statement
# lengths manageable; the combined vector is reshaped into a 500 x 37 data
# frame (37 columns, filled column-by-column) further below.
fifaRaw <- c(fifaRaw, -0.408, -0.567, 2.603, -0.197, -0.144, -0.356, -0.25, -0.62, 3.025, -0.62, -0.567, 2.708, -0.62, -0.144, -0.197, -0.567, -0.25, -0.197, 2.708, -0.197, -0.408, -0.673, -0.303, 2.391, 2.233, -0.197, -0.303, -0.461, -0.62, 2.497, -0.303, -0.408, 2.444, -0.408, -0.567, -0.25, -0.197, -0.303, -0.25, -0.408, -0.514, -0.408, -0.197, -0.356, -0.408, -0.408, -0.25, -0.62, -0.567, -0.567, -0.25, 2.761, -0.461, -0.197, -0.514, -0.356, -0.25, -0.461, -0.303, -0.408, 2.814, -0.303, -0.673, 2.55, -0.356, -0.356, -0.144, -0.356, 1.863, -0.091, -0.567, -0.514, -0.567, -0.303, -0.514, -0.197, -0.144, -0.408, -0.514, -0.62, -0.303, -0.62, -0.144, 2.339, -0.62, -0.62, -0.408, 3.025, -0.567, -0.197, -0.514, -0.303, -0.567, -0.356, -0.25, -0.461, -0.356, -0.62, -0.356, -0.408, 2.603, -0.62, 2.391, -0.356, -0.461, -0.144, -0.356, -0.673, -0.303, -0.461, -0.303, -0.567, -0.303, -0.25, -0.673, 2.761, -0.62, -0.408, -0.303, -0.62, -0.461, -0.567, 2.127, -0.303, -0.25, -0.673, -0.197, -0.408, -0.144, 1.652, -0.514, -0.567, -0.303, -0.303, -0.514, -0.408, -0.197, 2.444, -0.197, -0.144, -0.303, -0.461, -0.303, -0.144, -0.25, -0.62, -0.408, -0.197, -0.197, -0.144, -0.567, -0.197, -0.567, 2.074, -0.303, -0.144, -0.356, -0.25, -0.303, 1.863, -0.567, 2.761, -0.62, -0.62, -0.25, -0.356, -0.514, -0.514, -0.197, -0.197, -0.567, -0.567, -0.461, -0.461, -0.408, -0.408, -0.408, 2.656, -0.356, -0.567, -0.567, -0.356, 3.237, 2.761, -0.567, -0.303, -0.461, -0.303, -0.356, -0.144, -0.144, -0.461, -0.356, -0.567, -0.62, -0.514, -0.356, 2.814, -0.356, -0.356, -0.25, -0.567, -0.197, 2.286, 2.603, 2.603, -0.567, -0.461, -0.356, -0.461, -0.303, -0.091, -0.514, -0.514, -0.144, -0.144, -0.197, -0.197, -0.567, -0.567, -0.197, -0.356, 2.339, -0.461, -0.144, -0.408, -0.197, -0.461, -0.567, -0.36, -0.308, -0.36, 2.239, -0.672, 3.487, -0.204, -0.308, -0.62, -0.412, -0.568, -0.256, -0.204, -0.204, -0.464, -0.412, -0.568, -0.36, -0.1, -0.412, 2.135, -0.62, 2.447, -0.152, -0.308, 2.447, -0.62, 
-0.308, -0.256, -0.412, -0.464, -0.412, -0.516, -0.204, 3.435, -0.308, -0.516, -0.256, -0.36, -0.464, -0.256, -0.464, 2.551, -0.568, -0.412, -0.568, -0.204, -0.152, 2.603, -0.308, -0.62, -0.204, -0.204, 3.123, -0.62, 2.915, -0.1, -0.36, 1.771, -0.256, -0.36, -0.256, -0.464, -0.36, -0.568, -0.412, -0.36, -0.568, -0.568, -0.412, -0.204, -0.464, -0.204, -0.412, -0.204, 2.811, -0.412, -0.516, -0.36, -0.464, -0.308, -0.152, -0.308, -0.204, 3.019, -0.464, -0.152, -0.256, -0.412, -0.412, -0.204, -0.568, -0.308, -0.256, -0.568, -0.204, -0.308, -0.36, -0.568, -0.464, -0.308, -0.516, -0.308, -0.568, -0.256, -0.204, -0.516, -0.204, -0.516, -0.1, 2.083, -0.568, -0.568, 2.187, -0.568, -0.568, -0.256, -0.62, -0.204, -0.36, -0.256, -0.256, 2.759, -0.152, 2.499, -0.62, -0.204, -0.62)
fifaRaw <- c(fifaRaw, -0.36, 2.499, -0.568, -0.412, -0.412, -0.204, -0.308, -0.256, -0.256, 2.187, -0.412, -0.62, -0.672, -0.204, -0.568, -0.464, -0.204, -0.256, -0.568, 3.071, -0.256, -0.256, -0.516, -0.308, -0.152, -0.1, -0.516, -0.308, -0.568, -0.308, -0.568, -0.308, -0.36, -0.516, -0.1, -0.412, 2.707, -0.256, -0.412, 2.967, 2.655, 2.499, -0.308, -0.568, -0.256, 2.499, -0.308, -0.152, -0.516, -0.256, -0.36, -0.204, -0.672, 1.668, -0.516, -0.152, -0.1, -0.36, -0.256, -0.62, -0.464, -0.568, -0.516, -0.308, 1.875, 2.395, -0.204, -0.568, -0.516, -0.464, -0.308, 2.551, 2.759, -0.516, -0.36, -0.62, -0.204, -0.204, -0.464, -0.308, -0.568, -0.516, -0.36, -0.412, -0.152, -0.464, -0.412, -0.36, -0.464, -0.412, -0.256, -0.412, -0.204, -0.256, -0.36, -0.308, -0.412, -0.204, -0.464, -0.36, -0.204, 1.771, -0.464, -0.62, -0.568, -0.36, 2.603, -0.464, -0.256, 2.135, -0.412, -0.204, 2.395, -0.62, -0.62, -0.516, -0.36, -0.204, -0.308, -0.464, -0.308, -0.62, -0.568, -0.204, -0.568, -0.464, -0.412, -0.62, -0.568, -0.204, 1.46, -0.412, -0.204, -0.36, 2.343, -0.412, -0.464, -0.36, -0.36, -0.256, -0.516, -0.308, -0.204, -0.152, -0.204, 3.071, -0.1, -0.308, -0.516, -0.256, -0.1, 1.979, -0.152, -0.152, 2.499, -0.36, -0.36, -0.412, -0.36, -0.204, -0.1, 2.291, -0.308, -0.152, -0.308, -0.36, 2.187, 2.395, -0.464, -0.464, -0.308, -0.256, 2.343, -0.412, -0.568, 2.343, -0.516, -0.568, -0.256, -0.308, -0.412, -0.516, -0.62, -0.464, -0.308, -0.568, -0.464, -0.152, -0.464, -0.62, -0.464, -0.464, -0.204, -0.672, 3.019, -0.62, -0.204, -0.256, -0.568, -0.36, -0.412, -0.308, -0.412, 2.655, -0.464, -0.672, 2.863, -0.568, -0.308, -0.412, -0.204, 1.927, -0.1, -0.36, -0.62, -0.516, -0.568, -0.256, -0.516, -0.516, -0.152, -0.62, -0.568, -0.204, -0.464, -0.152, 2.447, -0.36, -0.36, -0.36, 2.655, -0.412, -0.308, -0.412, -0.204, -0.36, -0.412, -0.412, -0.412, -0.568, -0.516, -0.256, -0.62, 2.759, -0.516, 2.603, -0.36, -0.36, -0.204, -0.152, -0.464, -0.204, -0.1, -0.308, -0.412, -0.308, -0.36, -0.516, 2.915, 
-0.62, -0.204, -0.412, -0.62, -0.308, -0.152, 2.187, -0.568, -0.36, -0.308, -0.308, -0.256, -0.464, 1.875, -0.516, -0.516, -0.62, -0.568, -0.36, -0.36, -0.1, 2.811, -0.672, -0.204, -0.568, -0.412, -0.516, -0.464, -0.308, -0.36, -0.308, -0.36, -0.36, -0.204, -0.36, -0.62, -0.516, 1.927, -0.256, -0.62, -0.62, -0.62, -0.568, 2.187, -0.568, 2.603, -0.62, -0.568, -0.308, -0.62, -0.516, -0.152, -0.256, -0.36, -0.1, -0.464, -0.256, -0.308, -0.568, -0.308, -0.516, 2.759, -0.36, -0.412, -0.568, -0.412, 3.123, 2.759, -0.308, -0.36, -0.152, -0.204, -0.62, -0.516, -0.204, -0.152, -0.152, -0.516, -0.62, -0.516, -0.256, 2.863, -0.412, -0.412, -0.36, -0.412, -0.204, 2.395, 2.135, 2.499, -0.36, -0.464, -0.204, -0.308, -0.568, -0.412, -0.256, -0.516, -0.256)
fifaRaw <- c(fifaRaw, -0.152, -0.568, -0.672, -0.62, -0.516, -0.308, -0.568, 1.823, -0.36, -0.516, -0.36, -0.464, -0.568, -0.256, -0.077, -0.31, -0.205, -0.373, -0.371, 1.509, -0.278, -0.36, -0.282, -0.251, -0.378, -0.26, -0.319, -0.273, 1.145, -0.382, -0.346, 2.421, -0.205, -0.389, -0.373, -0.328, -0.328, -0.395, -0.31, -0.367, -0.282, -0.278, -0.305, 0.47, -0.389, -0.375, -0.333, -0.314, 1.236, -0.356, -0.333, -0.077, -0.223, 0.123, 0.871, -0.205, -0.292, -0.205, 0.214, 0.871, 0.178, -0.241, -0.384, -0.356, 1.418, -0.26, -0.31, 0.251, -0.319, -0.041, 0.506, -0.333, -0.384, -0.187, -0.296, 0.78, -0.205, -0.187, -0.342, -0.187, -0.342, -0.364, -0.31, -0.241, -0.356, -0.31, -0.287, 13.545, -0.241, -0.187, -0.132, -0.346, -0.278, 1.145, 0.014, -0.305, -0.282, -0.342, -0.323, -0.319, -0.391, -0.373, -0.364, -0.114, 0.397, -0.314, -0.255, -0.282, -0.205, -0.205, -0.328, 0.78, -0.273, -0.278, -0.241, -0.132, -0.223, -0.382, -0.278, -0.362, -0.382, -0.292, -0.223, 0.324, -0.376, -0.041, 0.689, -0.378, -0.301, 0.689, -0.333, -0.373, -0.305, -0.187, -0.351, -0.187, -0.077, 0.251, -0.367, -0.391, 0.087, 1.236, 0.397, -0.342, -0.314, -0.369, -0.333, -0.333, -0.251, -0.346, -0.346, -0.369, -0.391, -0.255, 0.16, -0.228, 5.339, 0.36, -0.354, -0.077, -0.023, 0.306, -0.004, -0.301, -0.356, -0.296, -0.273, -0.077, 0.506, -0.205, -0.287, 0.251, -0.187, 0.214, -0.237, -0.168, -0.337, -0.282, -0.278, -0.223, -0.333, 0.196, -0.323, -0.328, -0.373, 0.269, -0.351, -0.354, -0.31, -0.337, -0.187, -0.319, -0.346, -0.328, -0.346, -0.4, -0.296, -0.059, -0.31, -0.168, -0.077, -0.323, -0.382, -0.375, -0.168, -0.114, -0.395, -0.31, -0.384, -0.292, -0.187, -0.296, 0.488, -0.232, -0.132, -0.387, -0.319, -0.273, -0.333, -0.36, -0.114, -0.273, 0.36, 0.415, -0.187, -0.305, 0.251, -0.385, 1.509, -0.319, -0.223, -0.223, -0.346, -0.387, -0.15, -0.382, -0.319, -0.391, 0.78, -0.205, -0.205, -0.301, -0.278, -0.376, -0.342, -0.305, -0.264, 1.418, -0.041, -0.187, 0.36, -0.358, 0.16, -0.319, -0.301, -0.31, 
0.597, -0.358, -0.041, -0.319, -0.278, -0.38, -0.387, -0.323, 0.178, -0.041, -0.205, -0.023, -0.292, -0.337, 0.397, -0.251, -0.395, -0.387, -0.38, -0.378, -0.282, -0.387, -0.228, -0.278, -0.228, -0.246, 2.695, -0.269, -0.364, 5.795, 1.236, 0.306, -0.333, -0.36, -0.384, -0.237, 1.053, -0.395, -0.342, 0.178, -0.323, 0.379, 1.6, -0.333, 1.236, -0.077, 0.597, -0.333, -0.168, -0.241, 0.78, -0.367, -0.393, -0.328, -0.387, 0.506, 0.506, -0.342, -0.382, 6.251, -0.337, -0.323, -0.278, -0.269, 0.324, 4.609, -0.26, -0.296, -0.132, -0.246, -0.273, 0.597, -0.205, -0.351, -0.391, -0.041, -0.223, -0.36, -0.351, 0.105, 0.269, -0.296, 0.488, 2.695, -0.385, 1.783, -0.205, -0.376)
fifaRaw <- c(fifaRaw, -0.287, -0.223, -0.337, -0.333, -0.096, -0.269, -0.353, -0.246, -0.305, -0.384, -0.356, 0.78, -0.246, 0.142, -0.26, -0.205, -0.384, -0.228, 0.196, -0.31, 1.145, -0.38, -0.205, 0.324, -0.323, -0.387, -0.319, -0.241, -0.36, -0.393, 0.214, 0.032, -0.367, -0.223, -0.15, -0.353, 0.287, 1.874, -0.31, -0.26, -0.187, -0.36, -0.351, -0.251, -0.241, -0.376, -0.337, 0.196, -0.059, -0.38, -0.269, -0.342, -0.237, -0.328, -0.273, -0.38, 0.105, -0.168, -0.114, 0.397, -0.389, 0.032, -0.31, -0.38, 1.053, 0.105, -0.077, 0.962, 1.053, -0.389, -0.395, 0.78, -0.369, -0.365, -0.282, -0.264, -0.342, 0.689, -0.223, -0.305, -0.059, -0.395, 0.251, -0.292, -0.096, 0.452, -0.369, -0.255, -0.264, 2.421, -0.314, -0.337, -0.31, -0.168, -0.396, -0.351, -0.023, -0.269, 0.433, -0.362, -0.378, -0.282, -0.36, -0.382, -0.232, -0.278, 0.306, -0.255, 0.689, 0.014, -0.168, -0.328, -0.314, 0.324, -0.365, -0.369, -0.337, 0.597, -0.282, -0.328, -0.287, -0.228, -0.023, 0.871, 0.069, -0.365, -0.354, -0.323, 0.306, -0.396, -0.378, -0.26, -0.273, 0.689, -0.301, -0.393, 0.306, -0.292, -0.132, -0.31, 0.597, -0.369, -0.255, -0.351, -0.358, -0.337, -0.358, 8.895, 0.306, -0.319, 0.178, -0.393, -0.077, -0.31, -0.38, 0.36, 0.105, -0.337, -0.205, -0.337, -0.375, 1.509, -0.205, -0.389, 0.397, -0.389, 0.105, 1.418, -0.328, -0.365)

# Reshape the accumulated standardized values into a 500 x 37 data frame.
# Guard first: matrix() silently recycles `data` when its length is not
# nrow * ncol, which would scramble every column without an obvious error.
# The recorded str() output below (500 obs. of 37 variables) confirms the
# vector is expected to be exactly 500 * 37 = 18500 values long.
stopifnot(length(fifaRaw) == 500 * 37)
# byrow=FALSE: the raw vector is laid out one complete column at a time
fifa19mtx <- matrix(data=fifaRaw, ncol=37, nrow=500, byrow=FALSE)
fifa19_scaled <- as.data.frame(fifa19mtx)
names(fifa19_scaled) <- c('Age', 'Potential', 'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure', 'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes', 'PlayerValue')
str(fifa19_scaled)
## 'data.frame':    500 obs. of  37 variables:
##  $ Age            : num  0.569 -1.555 -0.068 -0.705 -1.342 ...
##  $ Potential      : num  -0.198 1.373 -0.023 -0.198 -0.023 ...
##  $ Crossing       : num  0.618 0.095 0.409 -1.841 0.042 ...
##  $ Finishing      : num  1.232 0.475 1.081 -1.242 -0.636 ...
##  $ HeadingAccuracy: num  0.467 0.189 0.801 -1.867 -0.033 ...
##  $ ShortPassing   : num  0.061 0.26 0.458 -2.386 0.26 ...
##  $ Volleys        : num  1.011 0.622 0.733 -1.488 -0.044 ...
##  $ Dribbling      : num  0.824 0.302 0.824 -1.837 0.093 ...
##  $ Curve          : num  0.902 0.902 0.635 -1.657 -0.164 ...
##  $ FKAccuracy     : num  1.237 0.747 0.747 -1.432 -0.015 ...
##  $ LongPassing    : num  -0.572 0.269 0.01 -2.124 0.463 ...
##  $ BallControl    : num  0.613 0.213 0.556 -2.13 -0.358 ...
##  $ Acceleration   : num  1.784 0.282 1 -2.003 -0.044 ...
##  $ SprintSpeed    : num  1.292 0.086 0.823 -2.057 -0.048 ...
##  $ Agility        : num  1.582 0.508 1.247 -2.515 -0.298 ...
##  $ Reactions      : num  0.662 -0.582 0.21 -0.921 -1.034 ...
##  $ Balance        : num  1.737 0.969 1.249 -0.705 0.341 ...
##  $ ShotPower      : num  1.18 0.573 0.628 -1.854 -0.144 ...
##  $ Jumping        : num  0.881 -0.454 0.714 -2.038 -0.454 ...
##  $ Stamina        : num  1.44 0.202 -0.151 -2.095 -0.799 ...
##  $ Strength       : num  -0.138 -0.787 -0.381 -2.164 -1.597 ...
##  $ LongShots      : num  1.229 0.426 0.627 -1.481 -0.377 ...
##  $ Aggression     : num  -0.79 0.142 -1.373 -2.189 0.434 ...
##  $ Interceptions  : num  -1.686 0.133 -0.537 -1.207 0.277 ...
##  $ Positioning    : num  1.227 0.436 0.881 -1.738 -0.848 ...
##  $ Vision         : num  0.517 0.517 0.731 -1.907 0.374 ...
##  $ Penalties      : num  1.026 0.712 0.712 -1.606 0.086 ...
##  $ Composure      : num  0.801 0.31 -0.673 -1.573 -0.918 ...
##  $ Marking        : num  0.232 -0.472 -0.522 -1.428 0.232 ...
##  $ StandingTackle : num  -1.551 -0.023 -1.506 -1.281 0.382 ...
##  $ SlidingTackle  : num  -1.516 0.053 -1.331 -1.377 0.837 ...
##  $ GKDiving       : num  -0.156 -0.523 -0.418 2.156 -0.261 ...
##  $ GKHandling     : num  -0.485 -0.323 -0.216 1.987 -0.592 ...
##  $ GKKicking      : num  -0.525 -0.25 -0.25 1.999 -0.36 ...
##  $ GKPositioning  : num  -0.567 -0.356 -0.144 2.074 -0.408 ...
##  $ GKReflexes     : num  -0.36 -0.308 -0.36 2.239 -0.672 ...
##  $ PlayerValue    : num  -0.077 -0.31 -0.205 -0.373 -0.371 ...
# Glimpse at the dataset
# glimpse() (dplyr/tibble) prints a transposed preview: one line per column
# with its type and leading values — handy for wide frames like this one.
glimpse(fifa19_scaled)
## Observations: 500
## Variables: 37
## $ Age             <dbl> 0.569, -1.555, -0.068, -0.705, -1.342, -0.280, -0.0...
## $ Potential       <dbl> -0.198, 1.373, -0.023, -0.198, -0.023, 2.246, -0.19...
## $ Crossing        <dbl> 0.618, 0.095, 0.409, -1.841, 0.042, -1.789, -0.586,...
## $ Finishing       <dbl> 1.232, 0.475, 1.081, -1.242, -0.636, -1.646, -0.283...
## $ HeadingAccuracy <dbl> 0.467, 0.189, 0.801, -1.867, -0.033, -2.200, 0.690,...
## $ ShortPassing    <dbl> 0.061, 0.260, 0.458, -2.386, 0.260, -2.121, -0.137,...
## $ Volleys         <dbl> 1.011, 0.622, 0.733, -1.488, -0.044, -1.599, -0.877...
## $ Dribbling       <dbl> 0.824, 0.302, 0.824, -1.837, 0.093, -2.150, -1.107,...
## $ Curve           <dbl> 0.902, 0.902, 0.635, -1.657, -0.164, -1.870, -0.804...
## $ FKAccuracy      <dbl> 1.237, 0.747, 0.747, -1.432, -0.015, -1.649, -0.887...
## $ LongPassing     <dbl> -0.572, 0.269, 0.010, -2.124, 0.463, -1.801, -0.572...
## $ BallControl     <dbl> 0.613, 0.213, 0.556, -2.130, -0.358, -2.073, -0.530...
## $ Acceleration    <dbl> 1.784, 0.282, 1.000, -2.003, -0.044, -1.481, -1.220...
## $ SprintSpeed     <dbl> 1.292, 0.086, 0.823, -2.057, -0.048, -1.789, -1.186...
## $ Agility         <dbl> 1.582, 0.508, 1.247, -2.515, -0.298, -1.709, -0.366...
## $ Reactions       <dbl> 0.662, -0.582, 0.210, -0.921, -1.034, -0.017, -0.35...
## $ Balance         <dbl> 1.737, 0.969, 1.249, -0.705, 0.341, -1.403, -0.008,...
## $ ShotPower       <dbl> 1.180, 0.573, 0.628, -1.854, -0.144, -1.964, -0.695...
## $ Jumping         <dbl> 0.881, -0.454, 0.714, -2.038, -0.454, 0.714, 0.881,...
## $ Stamina         <dbl> 1.440, 0.202, -0.151, -2.095, -0.799, -1.153, 0.556...
## $ Strength        <dbl> -0.138, -0.787, -0.381, -2.164, -1.597, -0.868, 0.7...
## $ LongShots       <dbl> 1.229, 0.426, 0.627, -1.481, -0.377, -1.481, -0.226...
## $ Aggression      <dbl> -0.790, 0.142, -1.373, -2.189, 0.434, -1.315, 0.725...
## $ Interceptions   <dbl> -1.686, 0.133, -0.537, -1.207, 0.277, -1.351, 0.516...
## $ Positioning     <dbl> 1.227, 0.436, 0.881, -1.738, -0.848, -1.688, -1.293...
## $ Vision          <dbl> 0.517, 0.517, 0.731, -1.907, 0.374, -0.838, -1.123,...
## $ Penalties       <dbl> 1.026, 0.712, 0.712, -1.606, 0.086, -1.042, -0.854,...
## $ Composure       <dbl> 0.801, 0.310, -0.673, -1.573, -0.918, 0.064, -0.263...
## $ Marking         <dbl> 0.232, -0.472, -0.522, -1.428, 0.232, -1.377, 0.836...
## $ StandingTackle  <dbl> -1.551, -0.023, -1.506, -1.281, 0.382, -1.641, 1.01...
## $ SlidingTackle   <dbl> -1.516, 0.053, -1.331, -1.377, 0.837, -1.562, 0.976...
## $ GKDiving        <dbl> -0.156, -0.523, -0.418, 2.156, -0.261, 3.364, -0.62...
## $ GKHandling      <dbl> -0.485, -0.323, -0.216, 1.987, -0.592, 3.115, -0.21...
## $ GKKicking       <dbl> -0.525, -0.250, -0.250, 1.999, -0.360, 2.328, -0.52...
## $ GKPositioning   <dbl> -0.567, -0.356, -0.144, 2.074, -0.408, 3.078, -0.14...
## $ GKReflexes      <dbl> -0.360, -0.308, -0.360, 2.239, -0.672, 3.487, -0.20...
## $ PlayerValue     <dbl> -0.077, -0.310, -0.205, -0.373, -0.371, 1.509, -0.2...
# Baseline OLS coefficients for PlayerValue; `[-1]` drops the intercept and
# as.vector() strips the coefficient names so the data frame rows are bare.
# Interceptions is excluded from the formula, giving 35 coefficients — the
# penalized fits reported below also show 35 terms without Interceptions.
coefs <- data.frame(OLS=as.vector(lm(PlayerValue ~ . -Interceptions, data=fifa19_scaled)$coef[-1]))
coefs
##            OLS
## 1   0.14080348
## 2   0.52499805
## 3   0.13608502
## 4   0.15456543
## 5  -0.01529349
## 6  -0.22318485
## 7   0.18580783
## 8  -0.29767876
## 9   0.02934133
## 10  0.05287707
## 11  0.17011601
## 12  0.17507295
## 13  0.06438190
## 14  0.03084774
## 15 -0.01369804
## 16  0.19450983
## 17  0.02776689
## 18 -0.10462695
## 19 -0.08338769
## 20 -0.07010475
## 21  0.02518615
## 22  0.00198291
## 23 -0.01617527
## 24 -0.12197746
## 25 -0.05634959
## 26 -0.03740878
## 27 -0.04404595
## 28 -0.04982907
## 29  0.20388990
## 30 -0.09861257
## 31  0.11197957
## 32  0.08476926
## 33  0.14563308
## 34 -0.30283233
## 35 -0.10014565
# Ridge regression: mdlRidge
# NOTE(review): Interceptions must be excluded here, matching the OLS fit
# above. With all 36 predictors (`PlayerValue ~ .`) the coefficient path has
# 36 columns, and the assignment into the 35-row `coefs` data frame below
# fails ("replacement has 36 rows"). The recorded output (35 ridge
# coefficients, none named Interceptions) confirms the model was actually
# fit without it.
mdlRidge <- caret::train(PlayerValue ~ . - Interceptions, data = fifa19_scaled, method = "ridge", tuneLength = 8)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked _by_ '.GlobalEnv':
## 
##     barley
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Plot ridge train object (RMSE across the tuned penalty values)
plot(mdlRidge)

# Ridge regression coefficients
# predict(type='coef', mode='norm') returns the whole coefficient path; the
# final row is the least-penalized end of the path.
coefRidge <- predict(mdlRidge$finalModel, type='coef', mode='norm')$coefficients
coefs$RidgeAll <- coefRidge[nrow(coefRidge),]
print(coefs)
##            OLS     RidgeAll
## 1   0.14080348  0.063757232
## 2   0.52499805  0.453334036
## 3   0.13608502  0.067490561
## 4   0.15456543  0.060594977
## 5  -0.01529349  0.008445445
## 6  -0.22318485 -0.059108238
## 7   0.18580783  0.124150501
## 8  -0.29767876 -0.074407150
## 9   0.02934133  0.038889391
## 10  0.05287707  0.057464591
## 11  0.17011601  0.068074550
## 12  0.17507295  0.039128679
## 13  0.06438190  0.040408023
## 14  0.03084774  0.017734038
## 15 -0.01369804 -0.003036336
## 16  0.19450983  0.199081287
## 17  0.02776689  0.006185195
## 18 -0.10462695 -0.040956383
## 19 -0.08338769 -0.080734840
## 20 -0.07010475 -0.046693927
## 21  0.02518615  0.019689478
## 22  0.00198291  0.021424749
## 23 -0.01617527 -0.009860133
## 24 -0.12197746 -0.062989656
## 25 -0.05634959 -0.051736655
## 26 -0.03740878 -0.005134552
## 27 -0.04404595  0.008680042
## 28 -0.04982907 -0.003446574
## 29  0.20388990  0.078066864
## 30 -0.09861257  0.016096396
## 31  0.11197957  0.052573952
## 32  0.08476926  0.034556053
## 33  0.14563308  0.042359446
## 34 -0.30283233 -0.033414894
## 35 -0.10014565  0.008586468
# Lasso regression: mdlLasso
# NOTE(review): Interceptions is excluded to stay consistent with the OLS fit
# above — with all 36 predictors the coefficient path would have 36 columns
# and the assignments into the 35-row `coefs` data frame would fail. The
# recorded output (35 named coefficients, no Interceptions) confirms this.
mdlLasso <- caret::train(PlayerValue ~ . - Interceptions, data = fifa19_scaled, method = "lasso", tuneLength = 8)

# Plot lasso object (RMSE across the tuned penalty values)
plot(mdlLasso)

# Get coefficients in every step: coefLasso
# Each row of the path is one step of the lasso; variables enter one by one.
coefLasso <- predict(mdlLasso$finalModel, type='coef', mode='norm')$coefficients

# Get coefficients for top 5 and all variables
# Row 6 = the step at which roughly five variables have entered the model.
(coefs$LassoTop5 <- coefLasso[6, ])
##             Age       Potential        Crossing       Finishing HeadingAccuracy 
##    0.000000e+00    3.934720e-01    0.000000e+00    0.000000e+00    0.000000e+00 
##    ShortPassing         Volleys       Dribbling           Curve      FKAccuracy 
##    0.000000e+00    6.717817e-03    0.000000e+00    0.000000e+00    6.410262e-02 
##     LongPassing     BallControl    Acceleration     SprintSpeed         Agility 
##    6.695871e-06    0.000000e+00    0.000000e+00    0.000000e+00    0.000000e+00 
##       Reactions         Balance       ShotPower         Jumping         Stamina 
##    2.081593e-01    0.000000e+00    0.000000e+00    0.000000e+00    0.000000e+00 
##        Strength       LongShots      Aggression     Positioning          Vision 
##    0.000000e+00    0.000000e+00    0.000000e+00    0.000000e+00    0.000000e+00 
##       Penalties       Composure         Marking  StandingTackle   SlidingTackle 
##    0.000000e+00    0.000000e+00    0.000000e+00    0.000000e+00    0.000000e+00 
##        GKDiving      GKHandling       GKKicking   GKPositioning      GKReflexes 
##    0.000000e+00    0.000000e+00    0.000000e+00    0.000000e+00    0.000000e+00
# The last step of the path is the unpenalized fit (it matches the OLS column).
(coefs$LassoAll <- coefLasso[nrow(coefLasso), ])
##             Age       Potential        Crossing       Finishing HeadingAccuracy 
##      0.14080348      0.52499805      0.13608502      0.15456543     -0.01529349 
##    ShortPassing         Volleys       Dribbling           Curve      FKAccuracy 
##     -0.22318485      0.18580783     -0.29767876      0.02934133      0.05287707 
##     LongPassing     BallControl    Acceleration     SprintSpeed         Agility 
##      0.17011601      0.17507295      0.06438190      0.03084774     -0.01369804 
##       Reactions         Balance       ShotPower         Jumping         Stamina 
##      0.19450983      0.02776689     -0.10462695     -0.08338769     -0.07010475 
##        Strength       LongShots      Aggression     Positioning          Vision 
##      0.02518615      0.00198291     -0.01617527     -0.12197746     -0.05634959 
##       Penalties       Composure         Marking  StandingTackle   SlidingTackle 
##     -0.03740878     -0.04404595     -0.04982907      0.20388990     -0.09861257 
##        GKDiving      GKHandling       GKKicking   GKPositioning      GKReflexes 
##      0.11197957      0.08476926      0.14563308     -0.30283233     -0.10014565
# ElasticNet regression: mdlElasticNet
# NOTE(review): Interceptions is excluded to stay consistent with the OLS fit
# above — with all 36 predictors the coefficient path would have 36 columns
# and the assignments into the 35-row `coefs` data frame would fail. The
# recorded output (35 named coefficients, no Interceptions) confirms this.
mdlElasticNet <- caret::train(PlayerValue ~ . - Interceptions, data = fifa19_scaled, method = "enet", tuneLength = 8)

# Plot elastic net object (RMSE across the tuned lambda/fraction grid)
plot(mdlElasticNet)

# Get elastic net coefficients: coefElasticNet
coefElasticNet <- predict(mdlElasticNet$finalModel, type="coef", mode="norm")$coefficients

# Get coefficients for top 5 and all variables
# Row 6 = the step at which roughly five variables have entered the model.
(coefs$ElasticNetTop5 <- coefElasticNet[6, ])
##             Age       Potential        Crossing       Finishing HeadingAccuracy 
##     0.000000000     0.394474977     0.000000000     0.000000000     0.000000000 
##    ShortPassing         Volleys       Dribbling           Curve      FKAccuracy 
##     0.000000000     0.012047805     0.000000000     0.000000000     0.060546666 
##     LongPassing     BallControl    Acceleration     SprintSpeed         Agility 
##     0.006535522     0.000000000     0.000000000     0.000000000     0.000000000 
##       Reactions         Balance       ShotPower         Jumping         Stamina 
##     0.212491467     0.000000000     0.000000000     0.000000000     0.000000000 
##        Strength       LongShots      Aggression     Positioning          Vision 
##     0.000000000     0.000000000     0.000000000     0.000000000     0.000000000 
##       Penalties       Composure         Marking  StandingTackle   SlidingTackle 
##     0.000000000     0.000000000     0.000000000     0.000000000     0.000000000 
##        GKDiving      GKHandling       GKKicking   GKPositioning      GKReflexes 
##     0.000000000     0.000000000     0.000000000     0.000000000     0.000000000
# The last step of the path is the least-penalized end (close to OLS but
# shrunk by the ridge component of the elastic net penalty).
(coefs$ElasticNetAll <- coefElasticNet[nrow(coefElasticNet), ])
##             Age       Potential        Crossing       Finishing HeadingAccuracy 
##     0.105432609     0.491074885     0.095533534     0.099838431    -0.001408899 
##    ShortPassing         Volleys       Dribbling           Curve      FKAccuracy 
##    -0.125702820     0.156053369    -0.151937955     0.034892424     0.054804480 
##     LongPassing     BallControl    Acceleration     SprintSpeed         Agility 
##     0.111216957     0.069901622     0.052288414     0.021620556    -0.008381661 
##       Reactions         Balance       ShotPower         Jumping         Stamina 
##     0.194134254     0.016142715    -0.074661110    -0.084036138    -0.058290987 
##        Strength       LongShots      Aggression     Positioning          Vision 
##     0.022426786     0.018131234    -0.013931411    -0.091620323    -0.057940407 
##       Penalties       Composure         Marking  StandingTackle   SlidingTackle 
##    -0.020553899    -0.018366690    -0.025703054     0.111342215    -0.012303960 
##        GKDiving      GKHandling       GKKicking   GKPositioning      GKReflexes 
##     0.060755399     0.043300637     0.061644527    -0.117459268    -0.029230937
# Fit MLP using nnet: mdlNNet
# set.seed(124)
# mdlNNet <- nnet(Class ~ ., data = pulsar_train, size = 3)

# Calculate train error: train_error
# pred_train <- predict(mdlNNet, pulsar_train, type="class")
# train_cm <- table(pred_train, pulsar_train$Class)
# (train_error <- 1 - sum(diag(train_cm)) / sum(train_cm))

# Calculate test error: test_error
# pred_test <- predict(mdlNNet, pulsar_test, type="class")
# test_cm <- table(pred_test, pulsar_test$Class)
# (test_error <- 1 - sum(diag(test_cm)) / sum(test_cm))


# Fit MLP using nnet: mdlNNet
# set.seed(124)
# mdlNNet <- nnet(Class ~ ., data = pulsar_train, size = 5)

# Calculate train error: train_error
# pred_train <- predict(mdlNNet, pulsar_train, type="class")
# train_cm <- table(pred_train, pulsar_train$Class)
# (train_error <- 1 - sum(diag(train_cm)) / sum(train_cm))

# Calculate test error: test_error
# pred_test <- predict(mdlNNet, pulsar_test, type="class")
# test_cm <- table(pred_test, pulsar_test$Class)
# (test_error <- 1 - sum(diag(test_cm)) / sum(test_cm))


# Create the 5-fold cross validation training control object
# control <- trainControl(method = "cv", number = 5, savePredictions = TRUE, classProbs = TRUE)

# Create the vector of base learners: baseLearners
# baseLearners <- c('rpart', 'glm', 'knn', 'svmRadial')

# Create and summarize the list of base learners: models
# models <- caretList(Class ~ ., data = training, trControl = control, methodList = baseLearners)
# summary(models)


# Classification results in each resample: results
# results <- resamples(models)

# Summarize and print the results in one line
# (results_summary <- summary(results))

# Show the correlation among the base learners' results
# modelCor(results)

# Display a scatter plot matrix of these results
# splom(results)


# Load caretEnsemble
# library(caretEnsemble)

# Set the seed
# set.seed(123)

# Stack the base learners
# stack.glm <- caretStack(models, method="glm", metric="Accuracy", trControl=control)

# Print the stacked model
# stack.glm

# Summarize the performance results for each base learner
# summary(results)


# Evaluate a fitted caret model on held-out data.
# trainObject: a caret train object; testData: data frame with a `Class`
# column holding the true labels ('yes' is treated as the positive class).
# Returns a length-2 numeric vector: accuracy followed by AUC.
# NOTE(review): relies on accuracy() and auc() being attached (Metrics pkg,
# presumably), and assumes column 2 of the probability frame corresponds to
# 'yes' — confirm against the model's class-level ordering.
evaluateModel <- function(trainObject, testData) {
  # Hard class predictions and class-membership probabilities
  predicted_classes <- predict(trainObject, testData)
  predicted_probs <- predict(trainObject, testData, type="prob")
  # Accuracy from the hard predictions; AUC from the positive-class probability
  acc_value <- accuracy(testData$Class, predicted_classes)
  auc_value <- auc(testData$Class == 'yes', predicted_probs[, 2])
  c(acc_value, auc_value)
}

# Evaluate the performance of each individual base learner
# baseLearnerStats <- sapply(X=stack.glm$models, FUN=evaluateModel, testing)
# baseLearnerDF <- data.frame(baseLearnerStats, row.names = c('acc', 'auc'))

# Compute stacked ensemble's accuracy on test data
# stack_preds <- predict(stack.glm, testing)
# stack_acc <- accuracy(testing$Class, stack_preds)

# Compute stacked ensemble's AUC on test data
# stack_preds_probs <- predict(stack.glm, testing, type="prob")
# stack_auc <- auc(testing$Class == 'yes', stack_preds_probs)

# Combine the stacked ensemble results
# (allLearnersDF <- cbind(baseLearnerDF, list(stack=c(stack_acc, stack_auc))))

Chapter 3 - Unsupervised Learning

K-means Clustering:

  • K-means is a simple, intuitive, and fast-running manner for clustering
  • K-means makes several assumptions about the data
    • Numeric and continuous variables
    • Variables with symmetric distributions
    • Variables with similar averages and standard deviations (based on Euclidean distances)
    • Skew should be dealt with - log-transform, Box-Cox, cubic root, etc.
    • Data should be scaled, typically either using z-score scaling or max/min scaling
  • Need to consider the optimal number of clusters, K
    • Optimal K will produce distinct, well-separated clusters
    • WSS (within-sum-squares) measures the compactness of the clusters
    • BSS (between-sum-squared) measures the distance between cluster centers
    • Typically, try various k within a range, then select the smallest k such that WSS / (WSS + BSS) < 0.2 - similar to the elbow method

Clustering Algorithms:

  • Selecting a clustering algorithm can be tricky since there are many possible algorithms - art as much as science
    • Every algorithm has assumptions and constraints about the input data
    • Helpful to understand the underlying clustering method
  • Representative-based clustering has centers/centroids, which are specific points that may or may not exist in the raw data - k-means, fuzzy c-means, k-medoids (PAM), expectation maximization
  • Connectivity-based methods connect objects based on their distance (furthest, closest, average, etc.) - Ward, SLINK
    • Agglomerative - bottom-up
    • Divisive - top-down
  • Density-based methods cluster based on areas of higher density in the data - DBSCAN, Optics, Mean-Shift
  • Clustering evaluation can be as tricky as cluster creation - also an art rather than a science
    • Internal validation - single metric such as compactness or separation
    • External validation - ground-truth (often not available)
    • Manual validation - human expert review
    • Indirect validation - assessment of how the clusters perform in an intended application

Feature Selection:

  • The ‘curse of dimensionality’ is common in machine learning - too many dimensions lead to sparse data
    • A lot of training data is needed - at least 5 training samples per feature is a good rule of thumb
  • Dimensionality reduction techniques work on both feature selection (same variables as original) and feature extraction (transformation on to a space of fewer dimensions)
  • Feature selection takes a subset of the features for further analysis - would require 2**N - 1 runs, so shortcuts are needed
    • Filter methods - select subsets based on internal features such as correlations with each other and/or the target variable
    • Wrapper methods - use ML methods to evaluate feature subsets, and then select the best subset (caution that overfitting can be problematic)
    • Embedded methods - combination of filter and wrapper, where the learning algorithm implicitly carries out feature selection (can often return a ranking of feature importances)
  • Feature extraction maps features on to a lower dimension, with a goal of minimizing data loss
    • New features are not interpretable, at least in the original feature space

Feature Extraction:

  • Feature selection and feature extraction can be used on the same dataset
    • If interpretability is important, prefer feature selection
    • If computational considerations are important, prefer feature extraction
    • Caution that feature selection can lead to getting stuck finding mappings to the outcome variable
  • A common question is to compare PCA (principal component analysis) and LDA (linear discriminant analysis)
    • Both extract a new feature set from the original dataset
    • Both create linear combinations of the original features
    • Both work better with normalized data and work only with continuous features
    • PCA is an unsupervised method that creates component axes for maximum variance - principal components are uncorrelated and ranked by variance explained
    • LDA is a supervised method for maximizing component axes for class-separation - the linear discriminants are the directions for maximizing class separation
    • PCA and LDA can be used together - e.g., PCA for dimensionality reduction followed by LDA to identify axes for best class separation

Example code includes:

# Raw mall-customers data as a single flat vector: 200 Age values, then 200
# AnnualIncome values, then 200 SpendingScore values (column-major layout).
# It is reshaped into a 200 x 3 data frame immediately below.
mallData <- c(19, 21, 20, 23, 31, 22, 35, 23, 64, 30, 67, 35, 58, 24, 37, 22, 35, 20, 52, 35, 35, 25, 46, 31, 54, 29, 45, 35, 40, 23, 60, 21, 53, 18, 49, 21, 42, 30, 36, 20, 65, 24, 48, 31, 49, 24, 50, 27, 29, 31, 49, 33, 31, 59, 50, 47, 51, 69, 27, 53, 70, 19, 67, 54, 63, 18, 43, 68, 19, 32, 70, 47, 60, 60, 59, 26, 45, 40, 23, 49, 57, 38, 67, 46, 21, 48, 55, 22, 34, 50, 68, 18, 48, 40, 32, 24, 47, 27, 48, 20, 23, 49, 67, 26, 49, 21, 66, 54, 68, 66, 65, 19, 38, 19, 18, 19, 63, 49, 51, 50, 27, 38, 40, 39, 23, 31, 43, 40, 59, 38, 47, 39, 25, 31, 20, 29, 44, 32, 19, 35, 57, 32, 28, 32, 25, 28, 48, 32, 34, 34, 43, 39, 44, 38, 47, 27, 37, 30, 34, 30, 56, 29, 19, 31, 50, 36, 42, 33, 36, 32, 40, 28, 36, 36, 52, 30, 58, 27, 59, 35, 37, 32, 46, 29, 41, 30, 54, 28, 41, 36, 34, 32, 33, 38, 47, 35, 45, 32, 32, 30, 15000, 15000, 16000, 16000, 17000, 17000, 18000, 18000, 19000, 19000, 19000, 19000, 20000, 20000, 20000, 20000, 21000, 21000, 23000, 23000, 24000, 24000, 25000, 25000, 28000, 28000, 28000, 28000, 29000, 29000, 30000, 30000, 33000, 33000, 33000, 33000, 34000, 34000, 37000, 37000, 38000, 38000, 39000, 39000, 39000, 39000, 40000, 40000, 40000, 40000, 42000, 42000, 43000, 43000, 43000, 43000, 44000, 44000, 46000, 46000, 46000, 46000, 47000, 47000, 48000, 48000, 48000, 48000, 48000, 48000, 49000, 49000, 50000, 50000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 54000, 57000, 57000, 58000, 58000, 59000, 59000, 60000, 60000, 60000, 60000, 60000, 60000, 61000, 61000, 62000, 62000, 62000, 62000, 62000, 62000, 63000, 63000, 63000, 63000, 63000, 63000, 64000, 64000, 65000, 65000, 65000, 65000, 67000, 67000, 67000, 67000, 69000, 69000, 70000, 70000, 71000, 71000, 71000, 71000, 71000, 71000, 72000, 72000, 73000, 73000, 73000, 73000, 74000, 74000, 75000, 75000, 76000, 76000, 77000, 77000, 77000, 77000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000, 79000, 79000, 81000, 81000, 85000, 85000, 86000, 86000, 87000, 
87000, 87000, 87000, 87000, 87000, 88000, 88000, 88000, 88000, 93000, 93000, 97000, 97000, 98000, 98000, 99000, 99000, 101000, 101000, 103000, 103000, 103000, 103000, 113000, 113000, 120000, 120000, 126000, 126000, 137000, 137000, 39, 81, 6, 77, 40, 76, 6, 94, 3, 72, 14, 99, 15, 77, 13, 79, 35, 66, 29, 98, 35, 73, 5, 73, 14, 82, 32, 61, 31, 87, 4, 73, 4, 92, 14, 81, 17, 73, 26, 75, 35, 92, 36, 61, 28, 65, 55, 47, 42, 42, 52, 60, 54, 60, 45, 41, 50, 46, 51, 46, 56, 55, 52, 59, 51, 59, 50, 48, 59, 47, 55, 42, 49, 56, 47, 54, 53, 48, 52, 42, 51, 55, 41, 44, 57, 46, 58, 55, 60, 46, 55, 41, 49, 40, 42, 52, 47, 50, 42, 49, 41, 48, 59, 55, 56, 42, 50, 46, 43, 48, 52, 54, 42, 46, 48, 50, 43, 59, 43, 57, 56, 40, 58, 91, 29, 77, 35, 95, 11, 75, 9, 75, 34, 71, 5, 88, 7, 73, 10, 72, 5, 93, 40, 87, 12, 97, 36, 74, 22, 90, 17, 88, 20, 76, 16, 89, 1, 78, 1, 73, 35, 83, 5, 93, 26, 75, 20, 95, 27, 63, 13, 75, 10, 92, 13, 86, 15, 69, 14, 90, 32, 86, 15, 88, 39, 97, 24, 68, 17, 85, 23, 69, 8, 91, 16, 79, 28, 74, 18, 83)
# Reshape the raw 600-value vector into 200 rows x 3 columns; byrow=FALSE
# (column-major) means the first 200 values fill column 1, the next 200
# fill column 2, and the last 200 fill column 3.
mall <- as.data.frame(matrix(data=mallData, ncol=3, byrow=FALSE))
names(mall) <- c("Age", "AnnualIncome", "SpendingScore")


# Glimpse over the mall data: 200 customers, 3 numeric variables
glimpse(mall)
## Observations: 200
## Variables: 3
## $ Age           <dbl> 19, 21, 20, 23, 31, 22, 35, 23, 64, 30, 67, 35, 58, 2...
## $ AnnualIncome  <dbl> 15000, 15000, 16000, 16000, 17000, 17000, 18000, 1800...
## $ SpendingScore <dbl> 39, 81, 6, 77, 40, 76, 6, 94, 3, 72, 14, 99, 15, 77, ...
# Display the range of every variable; AnnualIncome's scale (15000-137000)
# dwarfs the other two, which motivates the scaling step below
sapply(mall, range)
##      Age AnnualIncome SpendingScore
## [1,]  18        15000             1
## [2,]  70       137000            99
# Age histogram 
hist(mall$Age, breaks=10)

# Spending score histogram 
hist(mall$SpendingScore, breaks=10)

# Annual income histogram 
hist(mall$AnnualIncome, breaks=10)

# Standardize each column (center/scale to z-scores) so distance-based
# clustering is not dominated by AnnualIncome's larger magnitude
mall_scaled <- scale(mall)


# Elbow-method scan: for each K in 1..10, run k-means on the scaled mall
# data (20 random starts) and record the within-cluster / total
# sum-of-squares ratio; lower values indicate more compact clusters.
ratios <- vapply(
    1:10,
    function(n_centers) {
        fit <- kmeans(mall_scaled, n_centers, nstart = 20)
        fit$tot.withinss / fit$totss
    },
    numeric(1)
)

# Line plot of the WSS/TSS ratio as a function of K; look for the "elbow"
plot(ratios, type = "b", xlab = "number of clusters")

# Cluster mall_scaled data using k = 6: mall_6
# (seed fixed so the k-means assignment is reproducible)
set.seed(123)
mall_6 <- kmeans(mall_scaled, centers=6, nstart=20)

# Average each variable per cluster; profiling is done on the UNSCALED
# mall data so the cluster means are in interpretable units
mall %>%
    mutate(cluster = mall_6$cluster) %>%
    group_by(cluster) %>%
    summarize_all(list(~mean(.)))
## # A tibble: 6 x 4
##   cluster   Age AnnualIncome SpendingScore
##     <int> <dbl>        <dbl>         <dbl>
## 1       1  56.3       54267.          49.1
## 2       2  41.9       88939.          17.0
## 3       3  32.7       86538.          82.1
## 4       4  26.7       57579.          47.8
## 5       5  25.2       25833.          76.9
## 6       6  45.5       26286.          19.4
library(clValid)
## Loading required package: cluster
# Create the list of clustering methods: methods
methods <- c("hierarchical", "kmeans", "pam")

# Compare clustering methods over K = 2..10 using internal validation
# measures (Connectivity, Dunn, Silhouette per the summary below): results
results <- clValid::clValid(mall_scaled, 2:10, clMethods = methods, validation = "internal")
## Warning in clValid::clValid(mall_scaled, 2:10, clMethods = methods, validation =
## "internal"): rownames for data not specified, using 1:nrow(data)
# Summarize the results
summary(results)
## 
## Clustering Methods:
##  hierarchical kmeans pam 
## 
## Cluster sizes:
##  2 3 4 5 6 7 8 9 10 
## 
## Validation Measures:
##                                  2       3       4       5       6       7       8       9      10
##                                                                                                   
## hierarchical Connectivity  11.5218 16.0488 16.0488 21.6802 24.4683 30.0873 37.3337 42.8595 50.8476
##              Dunn           0.0920  0.0926  0.1007  0.1216  0.1216  0.1262  0.1262  0.1538  0.1304
##              Silhouette     0.3249  0.3400  0.3839  0.4096  0.3896  0.3800  0.3756  0.4069  0.3954
## kmeans       Connectivity  15.2317 29.3147 34.4337 33.1044 34.9714 38.8690 45.2663 54.4786 65.6397
##              Dunn           0.0596  0.0445  0.0593  0.0659  0.0554  0.0660  0.0673  0.1151  0.1452
##              Silhouette     0.3355  0.3503  0.4040  0.4166  0.4274  0.4298  0.4171  0.4156  0.4002
## pam          Connectivity  40.4341 23.7587 31.8710 32.8016 36.6397 42.2599 49.5706 64.0647 64.4107
##              Dunn           0.0383  0.0683  0.0551  0.1005  0.0554  0.0554  0.0741  0.0660  0.0450
##              Silhouette     0.3161  0.3588  0.4004  0.3667  0.4253  0.4137  0.3798  0.3735  0.3866
## 
## Optimal Scores:
## 
##              Score   Method       Clusters
## Connectivity 11.5218 hierarchical 2       
## Dunn          0.1538 hierarchical 9       
## Silhouette    0.4298 kmeans       7
# Create the list of clustering methods: methods
methods <- c("hierarchical", "kmeans", "pam")

# Compare the same methods using stability validation measures
# (APN, AD, ADM, FOM per the summary below): results
results <- clValid(mall_scaled, 2:10, clMethods = methods, validation = "stability")
## Warning in clValid(mall_scaled, 2:10, clMethods = methods, validation =
## "stability"): rownames for data not specified, using 1:nrow(data)
# Summarize the results
summary(results)
## 
## Clustering Methods:
##  hierarchical kmeans pam 
## 
## Cluster sizes:
##  2 3 4 5 6 7 8 9 10 
## 
## Validation Measures:
##                        2      3      4      5      6      7      8      9     10
##                                                                                 
## hierarchical APN  0.1064 0.1838 0.2506 0.2997 0.3282 0.3631 0.3804 0.3376 0.3452
##              AD   2.1956 1.8880 1.7034 1.6045 1.5477 1.4783 1.4540 1.3322 1.3060
##              ADM  1.0162 0.8303 0.8859 0.9028 0.9009 0.9023 0.8922 0.8005 0.7830
##              FOM  0.9942 0.9878 0.9549 0.9468 0.9361 0.9313 0.9324 0.8960 0.8934
## kmeans       APN  0.2547 0.2913 0.3127 0.3659 0.3464 0.3407 0.3480 0.3740 0.3924
##              AD   2.0424 1.8111 1.6195 1.5768 1.4826 1.4025 1.3432 1.3095 1.2830
##              ADM  0.7833 0.8359 0.8218 0.9040 0.9127 0.8658 0.8289 0.8232 0.8005
##              FOM  0.9959 0.9791 0.9484 0.9427 0.9327 0.9240 0.9002 0.8893 0.8939
## pam          APN  0.2095 0.3028 0.2872 0.3451 0.3816 0.3705 0.3732 0.4302 0.4512
##              AD   1.9793 1.8171 1.5905 1.5410 1.4516 1.3800 1.3369 1.3232 1.2558
##              ADM  0.6366 0.8874 0.7664 0.8463 0.8339 0.7942 0.7569 0.7856 0.7631
##              FOM  0.9779 0.9789 0.9350 0.9386 0.9330 0.9371 0.9264 0.9328 0.8989
## 
## Optimal Scores:
## 
##     Score  Method       Clusters
## APN 0.1064 hierarchical 2       
## AD  1.2558 pam          10      
## ADM 0.6366 pam          2       
## FOM 0.8893 kmeans       9
# Plot 3D mall_scaled data (Age x AnnualIncome x SpendingScore)
# NOTE(review): `results` at this point is the stability-validation run
# from above — presumably its clusterObjs slot holds the same fitted
# models as the internal run; confirm before relying on these centers.
plot3D::scatter3D(x = mall_scaled[, 1], y = mall_scaled[, 2], z = mall_scaled[, 3], col = "blue")

# Get K-means centroids for K = 7 and add them to the plot
km_centers <- results@clusterObjs$kmeans$`7`$centers
plot3D::points3D(km_centers[, 1], km_centers[, 2], km_centers[, 3], col = "red", pch=20, add=TRUE, cex=2.5)

# Get PAM's medoids for K = 7 and add them to the plot; medoids are row
# indices into the data, so they are looked up in mall_scaled
pam_idxs <- results@clusterObjs$pam$'7'$medoids
pam_med <- mall_scaled[pam_idxs, ]
plot3D::points3D(pam_med[, 1], pam_med[, 2], pam_med[, 3], col = "green", pch=20, add=TRUE, cex=2.5)

# Keep a copy of the original apps data (apps itself is loaded earlier
# in the document, outside this section) before overwriting it
appsOld <- apps


# Select/rename columns, strip the "$" prefix so Price becomes numeric,
# and drop incomplete rows. HasPositiveReviews is a deliberately constant
# column, added to demonstrate near-zero-variance detection below.
apps <- appsOld %>%
    select(Rating, Reviews, Installs, Type, Price, `Content Rating`) %>%
    rename(Content=`Content Rating`) %>%
    mutate(HasPositiveReviews=TRUE, Price=as.numeric(gsub('\\$', '', Price))) %>%
    filter(complete.cases(.))


# Glimpse at the data
glimpse(apps)
## Observations: 9,366
## Variables: 7
## $ Rating             <dbl> 4.1, 3.9, 4.7, 4.5, 4.3, 4.4, 3.8, 4.1, 4.4, 4.7...
## $ Reviews            <dbl> 159, 967, 87510, 215644, 967, 167, 178, 36815, 1...
## $ Installs           <chr> "10,000+", "500,000+", "5,000,000+", "50,000,000...
## $ Type               <chr> "Free", "Free", "Free", "Free", "Free", "Free", ...
## $ Price              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Content            <chr> "Everyone", "Everyone", "Everyone", "Teen", "Eve...
## $ HasPositiveReviews <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, ...
# Identify near-zero-variance predictors: nzv
# (names=TRUE returns the column names rather than indices)
nzv <- caret::nearZeroVar(apps, names=TRUE)
print(nzv)
## [1] "Price"              "HasPositiveReviews"
# Frequency of the HasPositiveReviews attribute — a single value, hence NZV
table(apps$HasPositiveReviews)
## 
## TRUE 
## 9366
# Frequency of the Price attribute — 8719 of 9366 rows are 0, hence NZV
table(apps$Price)
## 
##      0   0.99      1    1.2   1.29   1.49    1.5   1.59   1.61    1.7   1.75 
##   8719    107      2      1      1     31      1      1      1      2      1 
##   1.76   1.97   1.99      2   2.49    2.5   2.56   2.59    2.9   2.95   2.99 
##      1      1     59      1     21      1      1      1      1      1    114 
##   3.02   3.04   3.08   3.28   3.49   3.88    3.9   3.95   3.99   4.29   4.49 
##      1      1      1      1      7      1      1      2     58      1      9 
##   4.59    4.6   4.77   4.84   4.99   5.49   5.99   6.49   6.99   7.49   7.99 
##      1      1      1      1     70      3     18      1     13      2      7 
##   8.49   8.99      9   9.99     10  10.99  11.99  12.99  13.99     14  14.99 
##      1      4      2     16      3      2      5      5      1      1     10 
##  15.46  15.99  16.99  17.99  18.99   19.4  19.99  24.99  29.99  33.99  37.99 
##      1      1      3      2      1      1      5      5      6      2      1 
##  39.99  79.99 299.99 379.99 389.99 399.99    400 
##      1      2      1      1      1     11      1
# Remove the two near-zero-variance features flagged above: apps_clean
apps_clean <- apps %>% 
    select(-HasPositiveReviews, -Price)


# Glimpse at the fifa data
# glimpse(fifa)

# Are there zero or near-zero variance features?
# nearZeroVar(fifa)

# Highly correlated predictors: cor_90plus
# (cor_90plus <- findCorrelation(cor(fifa), names = TRUE))

# Highly correlated predictors (>= 98%): cor_98plus
# (cor_98plus <- findCorrelation(cor(fifa), names = TRUE, cutoff = 0.98))

# Remove cor_90plus features: fifa_clean
# fifa_clean <- fifa %>% 
#     select(-cor_90plus)


# Train model on original scaled data: mdl_orig
# mdl_orig <- train(Club ~ ., data = team_train, method="svmLinear2", trControl = trainCtrl)

# Predict on original test data: orig_preds, orig_probs
# orig_preds <- predict(mdl_orig, team_test)
# orig_probs <- predict(mdl_orig, team_test, type="prob")

# Compute and print the confusion matrix: cm_orig
# (cm_orig <- confusionMatrix(orig_preds, team_test$Club))

# Compute and print AUC: auc_orig
# (auc_orig <- auc(team_test$Club == 'Real.Madrid', orig_probs$'Real.Madrid'))


# Transform training and test data: train_pca, test_pca
# pca <- preProcess(x = team_train[, -match("Club", names(team_train))], method = "pca")
# train_pca <- predict(pca, team_train)
# test_pca <- predict(pca, team_test)

# Train model on PCA data: mdl_pca
# mdl_pca <- train(Club ~ ., data = train_pca, method = "svmLinear2", trControl = trainCtrl)

# Predict on PCA data: pca_preds, pca_probs
# pca_preds <- predict(mdl_pca, test_pca)
# pca_probs <- predict(mdl_pca, test_pca, type = "prob")

# Compute and print confusion matrix & AUC: cm_pca, auc_pca
# (cm_pca <- confusionMatrix(pca_preds, test_pca$Club))
# (auc_pca <- auc(test_pca$Club == 'Real.Madrid', pca_probs$'Real.Madrid'))


# Transform training and test data: train_lda, test_lda
# my_lda <- lda(Club ~ ., data = team_train)
# train_lda <- as.data.frame(predict(my_lda, team_train))
# test_lda <- as.data.frame(predict(my_lda, team_test))

# Train model on LDA-preprocessed data: mdl_lda
# mdl_lda <- train(class ~ ., data = train_lda, method="svmLinear2", trControl = trainCtrl)

# Predict on LDA-ed test data: lda_preds, lda_probs
# lda_preds <- predict(mdl_lda, test_lda)
# lda_probs <- predict(mdl_lda, test_lda, type="prob")

# Compute and print confusion matrix & AUC: cm_lda, auc_lda
# (cm_lda <- confusionMatrix(lda_preds, test_lda$class))
# (auc_lda <- auc(test_lda$class == 'Real.Madrid', lda_probs$Real.Madrid))

Chapter 4 - Model Evaluation

Model Evaluation:

  • Several aspects should be considered when evaluating an ML model - classification, regression, and clustering have different techniques
    • Make sure that the evaluation is realistic through appropriate holdout (test-validate-train)
    • Cross-validation can be a useful technique for estimating OOB and OOS errors
  • Confusion matrices and ROC/AUC are commonly used for assessing classification algorithms
    • May want to modify accuracy to a cost-sensitive accuracy (errors with different penalties) or work with balanced classes
  • Regression models are often evaluated using RMSE
  • Clustering is often evaluated based on some mix of WSS and BSS - compactness and good separation

Handling Imbalanced Data:

  • Significantly imbalanced data can lead to algorithmic and reporting (especially accuracy) problems for classification data
  • Two popular avenues for dealing with imbalanced data - cost-sensitive classification, or sub-sampling
    • In cost-sensitive classification, the goal is to minimize the cost of classification errors
    • In subsampling training data, can downsample majority; upsample minority; and SMOTE (synthetic minority oversampling technique)
  • Can subsample before model evaluation, though this can lead to over-optimistic assessments of the model
    • Subsampling during model training (e.g., bootstraps) is computationally more expensive but also more likely to give realistic assessments of model performance

Hyperparameter Tuning:

  • Model parameters are learned during the training process while hyperparameters are provided as inputs to the model
    • Tuning hyperparameters can be seen as part (often iterative) of a meta-learning process
  • Three main hyperparameter tuning strategies include grid search, random search, and informed search
    • Grid search is an exhaustive search over all possible combinations of hyperparameters - very computationally expensive but also easy to parallelize
    • Random search involves sampling from possible combinations of hyperparameters - easy to parallelize and can outperform grid search on continuous variables
    • Informed search methods like Bayesian optimization pick parameters based on performance - samples more aggressively around more promising values
  • Several R packages can help with hyperparameter tuning - caret, mlr, h2o

Random Forests or Gradient Boosted Trees:

  • Random Forests and Gradient Boosted Trees are both popular and successful models
    • Both are top performers for classification or regression
    • Both use decision trees as base learners and can handle missing values and require a number of trees to be pre-specified
    • Random forests use bagging on deep trees (minimize bias) and ensembling to reduce variance; grown in parallel and easier to tune
    • Gradient boosted trees use boosting on shallow trees (minimize variance) and ensembling to reduce bias; grown sequentially, with trees added only as needed
  • Random forests can be run using several packages including randomForest and ranger, or inside caret
    • tunedModel <- randomForest::tuneRF(x=predictors, y=response, nTreeTry=500) # tunes mtry
    • tunedModel <- caret::train(x=predictors, y=response, method=“rf”) # tunes mtry by default, and can specify others for tuning
  • Gradient boosted trees can be run using several packages including gbm and xgboost
    • opt_ntree_cv <- gbm::gbm.perf(model, method=“cv”)
    • opt_ntree_oob <- gbm::gbm.perf(model, method=“OOB”)
    • model <- caret::train(x=predictors, y=response, method=“xgbLinear”)

Wrap Up:

  • Data preprocessing and visualization - scaling, missing data, imputation, anomaly detection
  • Supervised learning - model interpretability, regularization, bias-variance trade-offs, ensembling
  • Unsupervised learning - clustering (k-means, hierarchical, PAM), feature selection, NZV, feature extraction
  • Model selection and evaluation - classification, regression, clustering, class imbalances, hyperparameter tuning, RF vs. GBM

Example code includes:

# Rebuild apps for a two-class (EDUCATION vs ENTERTAINMENT) classification
# task; rows with the non-numeric size "Varies with device" are dropped.
apps <- appsOld %>%
    select(Category, Rating, Reviews, Size, Installs, `Content Rating`) %>%
    rename(Content.Rating=`Content Rating`) %>%
    filter(complete.cases(.), Category %in% c("EDUCATION", "ENTERTAINMENT"), 
           Size!="Varies with device"
           ) %>%
    mutate(Category=factor(Category), Installs=factor(Installs), Content.Rating=factor(Content.Rating))

# Normalize app sizes to megabytes: "18M" -> 18, "512k" -> 0.512.
# Sizes matching neither pattern remain NA.
appSize <- rep(NA, nrow(apps))
mbSize <- grep("^[0-9][0-9\\.]*M", apps$Size)
kbSize <- grep("^[0-9][0-9\\.]*k", apps$Size)
appSize[mbSize] <- as.numeric(gsub('M', '', apps$Size[mbSize]))
appSize[kbSize] <- as.numeric(gsub('k', '', apps$Size[kbSize])) / 1000

apps$Size <- appSize
glimpse(apps)
## Observations: 200
## Variables: 6
## $ Category       <fct> EDUCATION, EDUCATION, EDUCATION, EDUCATION, EDUCATIO...
## $ Rating         <dbl> 4.6, 4.7, 4.6, 4.7, 4.5, 4.7, 4.8, 4.6, 4.6, 4.6, 4....
## $ Reviews        <dbl> 181893, 2544, 85375, 314299, 9770, 32346, 4075, 1061...
## $ Size           <dbl> 18.0, 18.0, 21.0, 3.3, 39.0, 3.2, 5.1, 11.0, 27.0, 3...
## $ Installs       <fct> "10,000,000+", "100,000+", "5,000,000+", "10,000,000...
## $ Content.Rating <fct> Everyone 10+, Everyone, Everyone, Everyone, Everyone...
# 75/25 train/test split by sampled (sorted) row indices
set.seed(1912261548)
trIndex <- sort(sample(1:nrow(apps), round(0.75*nrow(apps)), replace=FALSE))
training <- apps[trIndex, ]
testing <- apps[-trIndex, ]


# 10-fold CV control; classProbs + twoClassSummary are required so that
# caret can report ROC/Sens/Spec instead of plain accuracy
cv10 <- caret::trainControl(method="cv", number=10, classProbs=TRUE, 
                            summaryFunction=caret::twoClassSummary
                            )

# Create KNN model, selecting k by the largest cross-validated ROC: mdlKNN
set.seed(123)
mdlKNN <- train(Category ~ ., data = training, method = "knn", trControl = cv10, metric="ROC")

# Print the KNN model and its confusion matrix
print(mdlKNN)
## k-Nearest Neighbors 
## 
## 150 samples
##   5 predictor
##   2 classes: 'EDUCATION', 'ENTERTAINMENT' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 134, 135, 135, 135, 135, 135, ... 
## Resampling results across tuning parameters:
## 
##   k  ROC        Sens       Spec     
##   5  0.6361111  0.7277778  0.5119048
##   7  0.6932044  0.7513889  0.6095238
##   9  0.6851190  0.7388889  0.5666667
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
# NOTE(review): ModelMetrics::confusionMatrix is being passed two factors;
# the degenerate output below (all 28 test rows in one cell, row/col
# labels lost) suggests caret::confusionMatrix was intended — confirm.
ModelMetrics::confusionMatrix(predict(mdlKNN, testing), testing$Category)
##      [,1] [,2]
## [1,]    0    0
## [2,]    0   28
# Predict class labels and probs: knn_preds, knn_probs
knn_preds <- predict(mdlKNN, newdata = testing)
knn_probs <- predict(mdlKNN, newdata = testing, type="prob")

# Print accuracy and AUC values (column 2 of knn_probs is the
# probability of the second class, ENTERTAINMENT)
print(Metrics::accuracy(testing$Category, knn_preds))
## [1] 0.58
print(Metrics::auc(testing$Category == 'ENTERTAINMENT', knn_probs[, 2]))
## [1] 0.6067323
## [1] 0.6067323
# Train SVM: mdlSVM
# set.seed(123)
# mdlSVM <- train(Overall ~ ., data = training, method = "svmRadial", trControl = cv10)

# Print the SVM model
# print(mdlSVM)

# Predict overall score on testing data: svm_preds
# svm_preds <- predict(mdlSVM, newdata = testing)

# Print RMSE and MAE values
# print(rmse(testing$Overall, svm_preds))
# print(mae(testing$Overall, svm_preds))


# Glimpse at the data — note the scaled:center / scaled:scale attributes
# attached by scale(), usable to back-transform to original units
glimpse(mall_scaled)
##  num [1:200, 1:3] -1.421 -1.278 -1.349 -1.135 -0.562 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:3] "Age" "AnnualIncome" "SpendingScore"
##  - attr(*, "scaled:center")= Named num [1:3] 38.9 60560 50.2
##   ..- attr(*, "names")= chr [1:3] "Age" "AnnualIncome" "SpendingScore"
##  - attr(*, "scaled:scale")= Named num [1:3] 14 26264.7 25.8
##   ..- attr(*, "names")= chr [1:3] "Age" "AnnualIncome" "SpendingScore"
# Run DIANA (divisive hierarchical clustering) over K = 2..10 with
# internal validation: results
results <- clValid::clValid(mall_scaled, 2:10, clMethods = "diana", validation = "internal")
## Warning in clValid::clValid(mall_scaled, 2:10, clMethods = "diana", validation =
## "internal"): rownames for data not specified, using 1:nrow(data)
# Print and summarize results
print(results)
## 
## Call:
## clValid::clValid(obj = mall_scaled, nClust = 2:10, clMethods = "diana", 
##     validation = "internal")
## 
## Clustering Methods:
##  diana 
## 
## Cluster sizes:
##  2 3 4 5 6 7 8 9 10 
## 
## Validation measures:
##  Connectivity Dunn Silhouette
# Tabulate the DIANA internal-validation measures for each K (2-10)
summary(results)
## 
## Clustering Methods:
##  diana 
## 
## Cluster sizes:
##  2 3 4 5 6 7 8 9 10 
## 
## Validation Measures:
##                           2       3       4       5       6       7       8       9      10
##                                                                                            
## diana Connectivity  15.2317 22.5933 24.7599 29.3944 35.5631 42.6484 54.9845 62.9528 73.1131
##       Dunn           0.0596  0.0275  0.0320  0.0401  0.0408  0.0434  0.0467  0.0483  0.0563
##       Silhouette     0.3355  0.3524  0.3971  0.4161  0.4258  0.4188  0.3992  0.4012  0.3869
## 
## Optimal Scores:
## 
##              Score   Method Clusters
## Connectivity 15.2317 diana  2       
## Dunn          0.0596 diana  2       
## Silhouette    0.4258 diana  6
# Plot results: the three validation measures reported above (Connectivity,
# Dunn, Silhouette) as a function of the number of clusters
plot(results)

# Glimpse at the data
# glimpse(pulsar)

# Is there a class imbalance?
# table(pulsar$target_class)

# Set seed and partition data
# set.seed(123)
# inTrain <- createDataPartition(y = pulsar$target_class, p = .75, list = FALSE)
# training <- pulsar[inTrain,]
# testing <- pulsar[-inTrain,]

# Is there class imbalance in the training and test sets?
# table(training$target_class)
# table(testing$target_class)


# Fit a decision tree (rpart) on `train_data`, tuning by 10-fold CV with
# ROC as the selection metric. `samplingMode` optionally rebalances the
# classes inside each resample ("down", "up", or "smote"); the default
# NULL applies no subsampling. Returns the fitted caret train object.
trainDTree <- function(train_data, samplingMode = NULL) {
    set.seed(123)
    cv_ctrl <- trainControl(
        method = "cv",
        number = 10,
        classProbs = TRUE,
        summaryFunction = twoClassSummary,
        sampling = samplingMode
    )
    train(
        target_class ~ .,
        data = train_data,
        method = "rpart",
        metric = "ROC",
        trControl = cv_ctrl
    )
}

# Train and print model with no subsampling: mdl_orig
# (mdl_orig <- trainDTree(training))

# Train model with downsampling: mdl_down
# (mdl_down <- trainDTree(training, samplingMode = "down"))

# Train model with upsampling: mdl_up
# (mdl_up <- trainDTree(training, samplingMode = "up"))

# Train model with SMOTE: mdl_smote
# (mdl_smote <- trainDTree(training, samplingMode = "smote"))


# Compute test-set AUC for a fitted caret classification model.
#
# model: a trained model supporting predict(..., type = "prob").
# data:  a data frame holding the predictors plus a `target_class`
#        column whose positive level is "yes".
#
# Returns the AUC of the predicted probability of "yes" against the
# binary truth (target_class == "yes").
get_auc <- function(model, data) {
    preds <- predict(model, data, type = "prob")[, "yes"]
    # Namespace the call instead of library(Metrics): attaching a package
    # inside a function mutates the caller's search path as a side effect,
    # and the file already uses Metrics:: elsewhere.
    Metrics::auc(data$target_class == "yes", preds)
}

# Create model list: mdl_list
# mdl_list <- list(orig = mdl_orig, down = mdl_down, up = mdl_up, smote = mdl_smote)

# Compute AUC on training subsamples: resampling
# resampling <- resamples(mdl_list)
# summary(resampling, metric="ROC")

# Compute AUC on test data: auc_values
# auc_values <- sapply(mdl_list, FUN=get_auc, data = testing)
# print(auc_values)


# Reproducible 75/25 train/test split of the car data by row index.
set.seed(1912261602)
n_obs <- nrow(car)
train_rows <- sort(sample(seq_len(n_obs), round(0.75 * n_obs), replace = FALSE))
car_train <- car[train_rows, ]
car_test <- car[-train_rows, ]


# Set up train control: 3-fold CV repeated 5 times: trc
trc <- caret::trainControl(method = "repeatedcv", number = 3, repeats = 5)

# Train radial-kernel SVM with caret's default tuning grid
# (C varied, sigma held constant — see printed output): svmr
svmr <- caret::train(consume ~ ., data = car_train, method = "svmRadial", trControl = trc)

# Print and plot SVM model
print(svmr)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 291 samples
##   5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (3 fold, repeated 5 times) 
## Summary of sample sizes: 194, 193, 195, 193, 194, 195, ... 
## Resampling results across tuning parameters:
## 
##   C     RMSE      Rsquared    MAE     
##   0.25  16.60917  0.07026194  9.817553
##   0.50  16.56663  0.07598980  9.836894
##   1.00  16.69086  0.07013250  9.949751
## 
## Tuning parameter 'sigma' was held constant at a value of 0.333598
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.333598 and C = 0.5.
plot(svmr)

# Set up train control: trc
trc <- caret::trainControl(method = "cv", number = 10)

# Create custom hyperparameter grid — full grid search over all
# 5 x 3 = 15 combinations of C and sigma: hp_grid
hp_grid <- expand.grid(C = seq(from=0.2, to=1.0, by=0.2), sigma = c(0.35, 0.6, 0.75))

# Train model using the explicit grid via tuneGrid: svmr
svmr <- caret::train(consume ~ ., data = car_train, method = "svmRadial", trControl = trc, tuneGrid = hp_grid)

# Print and plot SVM model
print(svmr)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 291 samples
##   5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 261, 261, 263, 262, 262, 263, ... 
## Resampling results across tuning parameters:
## 
##   C    sigma  RMSE      Rsquared   MAE     
##   0.2  0.35   16.26445  0.1133446  9.805302
##   0.2  0.60   16.29272  0.1095463  9.803260
##   0.2  0.75   16.27141  0.1114307  9.776178
##   0.4  0.35   16.28267  0.1171166  9.834590
##   0.4  0.60   16.26987  0.1189881  9.818069
##   0.4  0.75   16.25134  0.1215314  9.797508
##   0.6  0.35   16.28707  0.1206414  9.842559
##   0.6  0.60   16.27261  0.1222429  9.864889
##   0.6  0.75   16.29126  0.1183495  9.855399
##   0.8  0.35   16.32585  0.1222056  9.876548
##   0.8  0.60   16.32531  0.1188580  9.907451
##   0.8  0.75   16.36181  0.1113415  9.907735
##   1.0  0.35   16.37773  0.1204052  9.913318
##   1.0  0.60   16.42585  0.1111526  9.966089
##   1.0  0.75   16.44576  0.1042592  9.990756
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.75 and C = 0.4.
plot(svmr)

# Set random seed so the random hyperparameter draw is reproducible
set.seed(42)

# Set up train control with random search instead of a grid: trc
trc <- caret::trainControl(method = "cv", number = 10, search = "random")

# Train model; tuneLength = 10 means ten random (sigma, C) combinations
svmr <- caret::train(consume ~ ., data = car_train, method = "svmRadial", trControl = trc, tuneLength = 10)

# Print and plot SVM model
print(svmr)
## Support Vector Machines with Radial Basis Function Kernel 
## 
## 291 samples
##   5 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 263, 262, 262, 262, 261, 262, ... 
## Resampling results across tuning parameters:
## 
##   sigma       C             RMSE      Rsquared    MAE      
##   0.01863605  139.97515122  16.96691  0.09870029  10.298489
##   0.01913389    0.04360606  16.67347  0.08400858  10.220681
##   0.03218316  182.01960806  17.35699  0.08326584  10.708358
##   0.04128325  447.99451171  18.81305  0.08272349  11.582198
##   0.05397889    0.11703741  16.49487  0.09765690   9.931373
##   0.06597003   40.93533689  17.42050  0.07540061  10.627318
##   0.07371349    4.52222667  16.56755  0.11544558   9.879941
##   0.23347673   26.75634732  18.13670  0.09849218  11.209528
##   0.35231334    0.49924112  16.44355  0.10131733   9.707891
##   1.30041593  511.66463964  27.86139  0.03973164  18.933067
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were sigma = 0.3523133 and C = 0.4992411.
plot(svmr)

# Train the RF regression model (predicting app Rating) with 500 trees: mdlRF
mdlRF <- randomForest::randomForest(formula = Rating ~ ., data = training, ntree = 500)

# Print the RF model
print(mdlRF)
## 
## Call:
##  randomForest(formula = Rating ~ ., data = training, ntree = 500) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.05913992
##                     % Var explained: 26.65
# RF variable importance (dotchart of the importance measure per predictor)
randomForest::varImpPlot(mdlRF)

# Numeric importance values (IncNodePurity) backing the plot above
print(mdlRF$importance)
##                IncNodePurity
## Category           0.9254966
## Reviews            1.9137065
## Size               1.4592641
## Installs           1.4008747
## Content.Rating     0.9916575
# Train a GBM model with 500 trees; no distribution is specified, so gbm
# falls back to gaussian (squared-error) loss for this regression: mdlGBM
mdlGBM <- gbm::gbm(formula = Rating ~ ., data = training, n.trees = 500)
## Distribution not specified, assuming gaussian ...
# Print GBM model
print(mdlGBM)
## gbm::gbm(formula = Rating ~ ., data = training, n.trees = 500)
## A gradient boosted model with gaussian loss function.
## 500 iterations were performed.
## There were 5 predictors of which 5 had non-zero influence.
# Summarize GBM's variable importance (relative influence per predictor)
summary(mdlGBM)

##                           var   rel.inf
## Installs             Installs 36.908130
## Reviews               Reviews 35.318460
## Size                     Size 16.759659
## Content.Rating Content.Rating  5.577588
## Category             Category  5.436162
# Predict on the testing data; gbm's predict requires n.trees explicitly
gbm_preds <- predict(mdlGBM, n.trees = 500, newdata = testing)
rf_preds <- predict(mdlRF, newdata = testing)

# RMSE metric for both models — RF is slightly better here: gbm_rmse, rf_rmse
(gbm_rmse <- Metrics::rmse(testing$Rating, gbm_preds))
## [1] 0.2979939
(rf_rmse <- Metrics::rmse(testing$Rating, rf_preds))
## [1] 0.2866548
# RRSE (root relative squared error) for both models: gbm_rrse, rf_rrse
(gbm_rrse <- Metrics::rrse(testing$Rating, gbm_preds))
## [1] 0.9146599
(rf_rrse <- Metrics::rrse(testing$Rating, rf_preds))
## [1] 0.8798557

Introduction to Natural Language Processing in R

Chapter 1 - True Fundamentals

Regular Expression Basics:

  • NLP is natural language processing which focuses on using computers to understand and analyze text
    • Classifiers, Topic Modeling, Named Entity Recognition, Sentiment Analysis, etc.
  • Regular expressions are a sequence of characters used to search text
    • \w is any alphanumeric character
    • The plus (+) means 1 or more
    • \s is any whitespace
    • \S is any non-whitespace (capitalizing means negation, so capital S is not whitespace)
  • R takes regular expressions in grep() and gsub()
    • grep(pattern, x, value=FALSE) will find matches to pattern in vector x
    • gsub(pattern, replacement, x) will replace pattern with replacement in vector x
  • Can use regexone.com for learning more about regular expressions

Tokenization:

  • Tokens can be as small as individual characters or as large as the entire text document
  • The tidytext package follows the tidy format
  • This course will use several datasets, including chapters from the book animal_farm
    • animal_farm %>% unnest_tokens(output=“word”, input=text_column, token=“words”) # token can be ‘sentences’, ‘lines’, and many others
    • animal_farm %>% filter(chapter==“Chapter 1”) %>% unnest_tokens(output=“Boxer”, input=text_column, token=“regex”, pattern=“(?i)boxer”) %>% slice(2:n())

Text Cleaning Basics:

  • Russian tweet dataset is available from fivethirtyeight.com
    • russian_tweets %>% unnest_tokens(word, content) %>% count(word, sort=TRUE)
    • russian_tweets %>% unnest_tokens(word, content) %>% anti_join(stop_words) %>% count(word, sort=TRUE)
  • Can add custom stop words to the existing stop words list
    • custom <- add_row(stop_words, word=“https”, lexicon=“custom”)
    • custom <- add_row(custom, word=“t.co”, lexicon=“custom”)
  • Stemming is the process of converting words to their roots - for example, enlisted has a stem of enlist
    • tidy_tweets <- russian_tweets %>% unnest_tokens(word, content) %>% anti_join(custom)
    • stemmed_tweets <- tidy_tweets %>% mutate(word=SnowballC::wordStem(word))

Example code includes:

# Ten example sentences for the regex exercises below
text <- c("John's favorite color two colors are blue and red.", "John's favorite number is 1111.", 'John lives at P Sherman, 42 Wallaby Way, Sydney', 'He is 7 feet tall', 'John has visited 30 countries', 'John only has nine fingers.', 'John has worked at eleven different jobs', 'He can speak 3 languages', "john's favorite food is pizza", 'John can name 10 facts about himself.')

# Print off each item that contained a numeric digit
# (\\d matches a digit; value = TRUE returns the matching strings)
grep(pattern = "\\d", x = text, value = TRUE)
## [1] "John's favorite number is 1111."                
## [2] "John lives at P Sherman, 42 Wallaby Way, Sydney"
## [3] "He is 7 feet tall"                              
## [4] "John has visited 30 countries"                  
## [5] "He can speak 3 languages"                       
## [6] "John can name 10 facts about himself."
# Find all items with a number followed by a space (\\s is whitespace);
# default value = FALSE returns the matching indices
grep(pattern = "\\d\\s", x = text)
## [1]  3  4  5  8 10
# How many times did you write down 'favorite'?
# (case-sensitive, so "john's favorite" in item 9 also counts: 2, 2, 9)
length(grep(pattern = "favorite", x = text))
## [1] 3
# Print off the text for every time you used your boss's name, John
grep('John', x = text, value = TRUE)
## [1] "John's favorite color two colors are blue and red."
## [2] "John's favorite number is 1111."                   
## [3] "John lives at P Sherman, 42 Wallaby Way, Sydney"   
## [4] "John has visited 30 countries"                     
## [5] "John only has nine fingers."                       
## [6] "John has worked at eleven different jobs"          
## [7] "John can name 10 facts about himself."
# Try replacing all occurrences of "John" with "He " — the bare pattern
# also matches inside "John's", leaving "He 's" and double spaces (see output)
gsub(pattern = 'John', replacement = 'He ', x = text)
##  [1] "He 's favorite color two colors are blue and red."
##  [2] "He 's favorite number is 1111."                   
##  [3] "He  lives at P Sherman, 42 Wallaby Way, Sydney"   
##  [4] "He is 7 feet tall"                                
##  [5] "He  has visited 30 countries"                     
##  [6] "He  only has nine fingers."                       
##  [7] "He  has worked at eleven different jobs"          
##  [8] "He can speak 3 languages"                         
##  [9] "john's favorite food is pizza"                    
## [10] "He  can name 10 facts about himself."
# Replace all occurences of "John " with 'He '.
clean_text <- gsub(pattern = 'John\\s', replacement = 'He ', x = text)
clean_text
##  [1] "John's favorite color two colors are blue and red."
##  [2] "John's favorite number is 1111."                   
##  [3] "He lives at P Sherman, 42 Wallaby Way, Sydney"     
##  [4] "He is 7 feet tall"                                 
##  [5] "He has visited 30 countries"                       
##  [6] "He only has nine fingers."                         
##  [7] "He has worked at eleven different jobs"            
##  [8] "He can speak 3 languages"                          
##  [9] "john's favorite food is pizza"                     
## [10] "He can name 10 facts about himself."
# Replace all occurences of "John's" with 'His'
gsub(pattern = "John\\'s", replacement = 'His', x = clean_text)
##  [1] "His favorite color two colors are blue and red."
##  [2] "His favorite number is 1111."                   
##  [3] "He lives at P Sherman, 42 Wallaby Way, Sydney"  
##  [4] "He is 7 feet tall"                              
##  [5] "He has visited 30 countries"                    
##  [6] "He only has nine fingers."                      
##  [7] "He has worked at eleven different jobs"         
##  [8] "He can speak 3 languages"                       
##  [9] "john's favorite food is pizza"                  
## [10] "He can name 10 facts about himself."
# Load the text of Animal Farm: one row per chapter, with a chapter label
# (chapter) and that chapter's full text (text_column).
animal_farm <- read_csv("./RInputFiles/animal_farm.csv")
## Parsed with column specification:
## cols(
##   chapter = col_character(),
##   text_column = col_character()
## )
# Inspect the structure: 10 chapters, two character columns
str(animal_farm)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 10 obs. of  2 variables:
##  $ chapter    : chr  "Chapter 1" "Chapter 2" "Chapter 3" "Chapter 4" ...
##  $ text_column: chr  "Mr. Jones, of the Manor Farm, had locked the hen-houses for the night, but was too drunk to remember to shut th"| __truncated__ "Three nights later old Major died peacefully in his sleep. His body was buried at the foot of the orchard.This "| __truncated__ "How they toiled and sweated to get the hay in! But their efforts were rewarded, for the harvest was an even big"| __truncated__ "By the late summer the news of what had happened on Animal Farm had spread across half the county. Every day Sn"| __truncated__ ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   chapter = col_character(),
##   ..   text_column = col_character()
##   .. )
# Split the text_column into sentences (one output row per sentence)
animal_farm %>%
    tidytext::unnest_tokens(output = "sentences", input = text_column, token = "sentences") %>%
    # Count sentences, per chapter
    count(chapter)
## # A tibble: 10 x 2
##    chapter        n
##    <chr>      <int>
##  1 Chapter 1    136
##  2 Chapter 10   167
##  3 Chapter 2    140
##  4 Chapter 3    114
##  5 Chapter 4     84
##  6 Chapter 5    158
##  7 Chapter 6    136
##  8 Chapter 7    190
##  9 Chapter 8    203
## 10 Chapter 9    195
# Split the text_column using regular expressions instead: break on literal
# periods ("\\."); counts differ slightly from the "sentences" tokenizer
# because abbreviations, "!"/"?" endings, etc. are handled differently
animal_farm %>%
    tidytext::unnest_tokens(output = "sentences", input = text_column, token = "regex", pattern = "\\.") %>%
    count(chapter)
## # A tibble: 10 x 2
##    chapter        n
##    <chr>      <int>
##  1 Chapter 1    131
##  2 Chapter 10   179
##  3 Chapter 2    150
##  4 Chapter 3    113
##  5 Chapter 4     92
##  6 Chapter 5    158
##  7 Chapter 6    127
##  8 Chapter 7    188
##  9 Chapter 8    200
## 10 Chapter 9    174
# Tokenize animal farm's text_column column: one lower-cased word per row
tidy_animal_farm <- animal_farm %>%
    tidytext::unnest_tokens(word, text_column) 

# Print the word frequencies -- dominated by stop words ("the", "and", ...)
tidy_animal_farm %>%
    count(word, sort = TRUE)
## # A tibble: 4,076 x 2
##    word      n
##    <chr> <int>
##  1 the    2187
##  2 and     966
##  3 of      899
##  4 to      814
##  5 was     633
##  6 a       620
##  7 in      537
##  8 had     529
##  9 that    451
## 10 it      384
## # ... with 4,066 more rows
# Remove stop words, using stop_words from tidytext
str(tidy_animal_farm)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 30037 obs. of  2 variables:
##  $ chapter: chr  "Chapter 1" "Chapter 1" "Chapter 1" "Chapter 1" ...
##  $ word   : chr  "mr" "jones" "of" "the" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   chapter = col_character(),
##   ..   text_column = col_character()
##   .. )
# anti_join() keeps only the rows whose word does NOT appear in stop_words;
# the row count drops from 30,037 to 10,579
tidy_animal_farm <- tidy_animal_farm %>%
    anti_join(tidytext::stop_words)
## Joining, by = "word"
str(tidy_animal_farm)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 10579 obs. of  2 variables:
##  $ chapter: chr  "Chapter 1" "Chapter 1" "Chapter 1" "Chapter 1" ...
##  $ word   : chr  "jones" "manor" "farm" "locked" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   chapter = col_character(),
##   ..   text_column = col_character()
##   .. )
# Perform stemming on tidy_animal_farm: wordStem() reduces each word to its
# Snowball stem (e.g. "animals"/"animal" both become "anim")
stemmed_animal_farm <- tidy_animal_farm %>%
    mutate(word = SnowballC::wordStem(word))

# Print the old word frequencies 
tidy_animal_farm %>%
    count(word, sort = TRUE)
## # A tibble: 3,611 x 2
##    word         n
##    <chr>    <int>
##  1 animals    248
##  2 farm       163
##  3 napoleon   141
##  4 animal     107
##  5 snowball   106
##  6 pigs        91
##  7 boxer       76
##  8 time        71
##  9 windmill    68
## 10 squealer    61
## # ... with 3,601 more rows
# Print the new word frequencies: distinct words fall from 3,611 to 2,751
# as inflected forms collapse onto shared stems
stemmed_animal_farm %>%
    count(word, sort = TRUE)
## # A tibble: 2,751 x 2
##    word         n
##    <chr>    <int>
##  1 anim       363
##  2 farm       173
##  3 napoleon   141
##  4 pig        114
##  5 snowbal    106
##  6 comrad      94
##  7 dai         86
##  8 time        83
##  9 boxer       76
## 10 windmil     70
## # ... with 2,741 more rows

Chapter 2 - Representations of Text

Understanding an R Corpus:

  • Corpora are collections of documents containing natural language text
    • The most common representation is the VCorpus, which holds the text and the metadata
    • The corpus ‘acq’ is available in library(tm)
    • acq[[1]]$meta # first article metadata
    • acq[[1]]$meta$places # places values of first article metadata
    • acq[[1]]$content # first article content
  • Can tidy a corpus so that each observation is represented by a row
    • tidy_data <- tidy(acq)
    • corpus <- VCorpus(VectorSource(tidy_data$text)) # convert from tibble to vcorpus, which will convert over only the text
    • meta(corpus, “Author”) <- tidy_data$author # add author
    • meta(corpus, “oldid”) <- tidy_data$oldid # add oldid

Bag-of-words Representation:

  • Typical vector representations include all words, converted to lowercase, that are in the document
    • A count by document of each word is then produced (can be boolean or count of word in document)
  • Sparse matrices are essential for text analysis - 20k x 40k matrix may have only a few thousand non-zero entries (well under 1%)

TFIDF - Term Frequency Inverse Document Frequency:

  • The TFIDF considers both term frequency in a specific text and inverse of term frequency in the overall corpus
    • IDF = log(N / n-with-word) # many other possibilities for calculating IDF, though this is a very common one; calculated by word
    • TF = n-word-in-document / n-words-in-document # i.e., the count of the word in the document divided by the document's total word count; calculated by document-word
  • Example for creation of a TFIDF matrix
    • df %>% unnest_tokens(output=“word”, token=“words”, input=text) %>% anti_join(stop_words) %>% count(ID, word, sort=TRUE) %>% bind_tf_idf(word, ID, n)
    • Creates columns tf, idf, and tf_idf (multiplication of tf*idf)

Cosine Similarity:

  • Can use TFIDF to assess article similarities using cosine similarity (angle formed by two vectors in n-space)
    • A-dot-B/(mag(A)*mag(B))
    • pairwise_similarity(tbl, item, feature, value, …) # item to be compared, feature that links the items, value of the feature
    • crude_weights %>% pairwise_similarity(X, word, tf_idf) %>% arrange(desc(similarity)) # will range between 0 (nothing in common) and 1 (identical)

Example code includes:

# Build crudeText: a character vector of 20 Reuters "crude oil" news articles,
# appended one at a time with c(). It stands in for the tm 'crude' example
# corpus so the VCorpus demo below can run from plain strings. Several of the
# longer articles are written as string literals that wrap across physical
# lines; R includes those line breaks in the string, which is harmless here.
crudeText <- c('Diamond Shamrock Corp said that\neffective today it had cut its contract prices for crude oil by\n1.50 dlrs a barrel.\n    The reduction brings its posted price for West Texas\nIntermediate to 16.00 dlrs a barrel, the copany said.\n    \"The price reduction today was made in the light of falling\noil product prices and a weak crude oil market,\" a company\nspokeswoman said.\n    Diamond is the latest in a line of U.S. oil companies that\nhave cut its contract, or posted, prices over the last two days\nciting weak oil markets.\n Reuter')
crudeText <- c(crudeText, 'OPEC may be forced to meet before a\nscheduled June session to readdress its production cutting\nagreement if the organization wants to halt the current slide\nin oil prices, oil industry analysts said.\n    \"The movement to higher oil prices was never to be as easy\nas OPEC thought. They may need an emergency meeting to sort out\nthe problems,\" said Daniel Yergin, director of Cambridge Energy\nResearch Associates, CERA.\n    Analysts and oil industry sources said the problem OPEC\nfaces is excess oil supply in world oil markets.\n    \"OPECs problem is not a price problem but a production\nissue and must be addressed in that way,\" said Paul Mlotok, oil\nanalyst with Salomon Brothers Inc.\n    He said the markets earlier optimism about OPEC and its\nability to keep production under control have given way to a\npessimistic outlook that the organization must address soon if\nit wishes to regain the initiative in oil prices.\n    But some other analysts were uncertain that even an\nemergency meeting would address the problem of OPEC production\nabove the 15.8 mln bpd quota set last December.\n    \"OPEC has to learn that in a buyers market you cannot have\ndeemed quotas, fixed prices and set differentials,\" said the\nregional manager for one of the major oil companies who spoke\non condition that he not be named. \"The market is now trying to\nteach them that lesson again,\" he added.\n    David T. Mizrahi, editor of Mideast reports, expects OPEC\nto meet before June, although not immediately. 
However, he is\nnot optimistic that OPEC can address its principal problems.\n    \"They will not meet now as they try to take advantage of the\nwinter demand to sell their oil, but in late March and April\nwhen demand slackens,\" Mizrahi said.\n    But Mizrahi said that OPEC is unlikely to do anything more\nthan reiterate its agreement to keep output at 15.8 mln bpd.\"\n    Analysts said that the next two months will be critical for\nOPECs ability to hold together prices and output.\n    \"OPEC must hold to its pact for the next six to eight weeks\nsince buyers will come back into the market then,\" said Dillard\nSpriggs of Petroleum Analysis Ltd in New York.\n    But Bijan Moussavar-Rahmani of Harvard Universitys Energy\nand Environment Policy Center said that the demand for OPEC oil\nhas been rising through the first quarter and this may have\nprompted excesses in its production.\n    \"Demand for their (OPEC) oil is clearly above 15.8 mln bpd\nand is probably closer to 17 mln bpd or higher now so what we\nare seeing characterized as cheating is OPEC meeting this\ndemand through current production,\" he told Reuters in a\ntelephone interview.\n Reuter')
crudeText <- c(crudeText, 'Texaco Canada said it lowered the\ncontract price it will pay for crude oil 64 Canadian cts a\nbarrel, effective today.\n    The decrease brings the companys posted price for the\nbenchmark grade, Edmonton/Swann Hills Light Sweet, to 22.26\nCanadian dlrs a bbl.\n    Texaco Canada last changed its crude oil postings on Feb\n19.\n Reuter')
crudeText <- c(crudeText, 'Marathon Petroleum Co said it reduced\nthe contract price it will pay for all grades of crude oil one\ndlr a barrel, effective today.\n    The decrease brings Marathons posted price for both West\nTexas Intermediate and West Texas Sour to 16.50 dlrs a bbl. The\nSouth Louisiana Sweet grade of crude was reduced to 16.85 dlrs\na bbl.\n    The company last changed its crude postings on Jan 12.\n Reuter')
crudeText <- c(crudeText, 'Houston Oil Trust said that independent\npetroleum engineers completed an annual study that estimates\nthe trusts future net revenues from total proved reserves at\n88 mln dlrs and its discounted present value of the reserves at\n64 mln dlrs.\n    Based on the estimate, the trust said there may be no money\navailable for cash distributions to unitholders for the\nremainder of the year.\n    It said the estimates reflect a decrease of about 44 pct in\nnet reserve revenues and 39 pct in discounted present value\ncompared with the study made in 1985.\n Reuter')
crudeText <- c(crudeText, 'Kuwait\"s Oil Minister, in remarks\npublished today, said there were no plans for an emergency OPEC\nmeeting to review oil policies after recent weakness in world\noil prices.\n    Sheikh Ali al-Khalifa al-Sabah was quoted by the local\ndaily al-Qabas as saying: \"None of the OPEC members has asked\nfor such a meeting.\"\n    He denied Kuwait was pumping above its quota of 948,000\nbarrels of crude daily (bpd) set under self-imposed production\nlimits of the 13-nation organisation.\n    Traders and analysts in international oil markets estimate\nOPEC is producing up to one mln bpd above a ceiling of 15.8 mln\nbpd agreed in Geneva last December.\n    They named Kuwait and the United Arab Emirates, along with\nthe much smaller producer Ecuador, among those producing above\nquota. Kuwait, they said, was pumping 1.2 mln bpd.\n    \"This rumour is baseless. It is based on reports which said\nKuwait has the ability to exceed its share. They suppose that\nbecause Kuwait has the ability, it will do so,\" the minister\nsaid.\n    Sheikh Ali has said before that Kuwait had the ability to\nproduce up to 4.0 mln bpd.\n    \"If we can sell more than our quota at official prices,\nwhile some countries are suffering difficulties marketing their\nshare, it means we in Kuwait are unusually clever,\" he said.\n    He was referring apparently to the Gulf state of qatar,\nwhich industry sources said was selling less than 180,000 bpd\nof its 285,000 bpd quota, because buyers were resisting\nofficial prices restored by OPEC last month pegged to a marker\nof 18 dlrs per barrel.\n    Prices in New York last week dropped to their lowest levels\nthis year and almost three dollars below a three-month high of\n19 dollars a barrel.\n    Sheikh Ali also delivered \"a challenge to any international\noil company that declared Kuwait sold below official prices.\"\n    Because it was charging its official price, of 16.67 dlrs a\nbarrel, it had lost custom, he said but did 
not elaborate.\n    However, Kuwait had guaranteed markets for its oil because\nof its local and international refining facilities and its own\ndistribution network abroad, he added.\n    He reaffirmed that the planned meeting March 7 of OPEC\"s\ndifferentials committee has been postponed until the start of\nApril at the request of certain of the body\"s members.\n    Ecuador\"s deputy energy minister Fernando Santos Alvite said\nlast Wednesday his debt-burdened country wanted OPEC to assign\na lower official price for its crude, and was to seek this at\ntalks this month of opec\"s pricing committee.\n    Referring to pressure by oil companies on OPEC members, in\napparent reference to difficulties faced by Qatar, he said: \"We\nexpected such pressure. It will continue through March and\nApril.\" But he expected the situation would later improve.\n REUTER')
crudeText <- c(crudeText, 'Indonesia appears to be nearing a\npolitical crossroads over measures to deregulate its protected\neconomy, the U.S. Embassy says in a new report.\n    To counter falling oil revenues, the government has\nlaunched a series of measures over the past nine months to\nboost exports outside the oil sector and attract new\ninvestment.\n    Indonesia, the only Asian member of OPEC and a leading\nprimary commodity producer, has been severely hit by last year\"s\nfall in world oil prices, which forced it to devalue its\ncurrency by 31 pct in September.\n    But the U.S. Embassy report says President Suharto\"s\ngovernment appears to be divided over what direction to lead\nthe economy.\n    \"(It) appears to be nearing a crossroads with regard to\nderegulation, both as it pertains to investments and imports,\"\nthe report says. It primarily assesses Indonesia\"s agricultural\nsector, but also reviews the country\"s general economic\nperformance.\n    It says that while many government officials and advisers\nare recommending further relaxation, \"there are equally strong\npressures being exerted to halt all such moves.\"\n    \"This group strongly favours an import substitution economy,\"\nthe report says.\n    Indonesia\"s economic changes have been welcomed by the World\nBank and international bankers as steps in the right direction,\nthough they say crucial areas of the economy like plastics and\nsteel remain highly protected, and virtual monopolies.\n    Three sets of measures have been announced since last May,\nwhich broadened areas for foreign investment, reduced trade\nrestrictions and liberalised imports.\n    The report says Indonesia\"s economic growth in calendar 1986\nwas probably about zero, and the economy may even have\ncontracted a bit. 
\"This is the lowest rate of growth since the\nmid-1960s,\" the report notes.\n    Indonesia, the largest country in South-East Asia with a\npopulation of 168 million, is facing general elections in\nApril.\n    But the report hold out little hope for swift improvement\nin the economic outlook. \"For 1987 early indications point to a\nslightly positive growth rate not exceeding one pct. Economic\nactivity continues to suffer due to the sharp fall in export\nearnings from the petroleum industry.\"\n    \"Growth in the non-oil sector is low because of weak\ndomestic demand coupled with excessive plant capacity, real\ndeclines in construction and trade, and a reduced level of\ngrowth in agriculture,\" the report states.\n    Bankers say continuation of present economic reforms is\ncrucial for the government to get the international lending its\nneeds.\n    A new World Bank loan of 300 mln dlrs last month in balance\nof payments support was given partly to help the government\nmaintain the momentum of reform, the Bank said.\n REUTER')
crudeText <- c(crudeText, 'Saudi riyal interbank deposits were\nsteady at yesterdays higher levels in a quiet market.\n    Traders said they were reluctant to take out new positions\namidst uncertainty over whether OPEC will succeed in halting\nthe current decline in oil prices.\n    Oil industry sources said yesterday several Gulf Arab\nproducers had had difficulty selling oil at official OPEC\nprices but Kuwait has said there are no plans for an emergency\nmeeting of the 13-member organisation.\n    A traditional Sunday lull in trading due to the European\nweekend also contributed to the lack of market activity.\n    Spot-next and one-week rates were put at 6-1/4, 5-3/4 pct\nafter quotes ranging between seven, six yesterday.\n    One, three, and six-month deposits were quoted unchanged at\n6-5/8, 3/8, 7-1/8, 6-7/8 and 7-3/8, 1/8 pct respectively.\n    The spot riyal was quietly firmer at 3.7495/98 to the\ndollar after quotes of 3.7500/03 yesterday.\n REUTER')
crudeText <- c(crudeText, 'The Gulf oil state of Qatar, recovering\nslightly from last years decline in world oil prices,\nannounced its first budget since early 1985 and projected a\ndeficit of 5.472 billion riyals.\n    The deficit compared with a shortfall of 7.3 billion riyals\nin the last published budget for 1985/86.\n    In a statement outlining the budget for the fiscal year\n1987/88 beginning today, Finance and Petroleum Minister Sheikh\nAbdul-Aziz bin Khalifa al-Thani said the government expected to\nspend 12.217 billion riyals in the period.\n    Projected expenditure in the 1985/86 budget had been 15.6\nbillion riyals.\n    Sheikh Abdul-Aziz said government revenue would be about\n6.745 billion riyals, down by about 30 pct on the 1985/86\nprojected revenue of 9.7 billion.\n    The government failed to publish a 1986/87 budget due to\nuncertainty surrounding oil revenues.\n    Sheikh Abdul-Aziz said that during that year the government\ndecided to limit recurrent expenditure each month to\none-twelfth of the previous fiscal years allocations minus 15\npct.\n    He urged heads of government departments and public\ninstitutions to help the government rationalise expenditure. He\ndid not say how the 1987/88 budget shortfall would be covered.\n    Sheikh Abdul-Aziz said plans to limit expenditure in\n1986/87 had been taken in order to relieve the burden placed on\nthe countrys foreign reserves.\n    He added in 1987/88 some 2.766 billion riyals had been\nallocated for major projects including housing and public\nbuildings, social services, health, education, transport and\ncommunications, electricity and water, industry and\nagriculture.\n    No figure was revealed for expenditure on defence and\nsecurity. 
There was also no projection for oil revenue.\n    Qatar, an OPEC member, has an output ceiling of 285,000\nbarrels per day.\n    Sheikh Abdul-Aziz said: \"Our expectations of positive signs\nregarding (oil) price trends, foremost among them OPECs\ndetermination to shoulder its responsibilites and protect its\nwealth, have helped us make reasonable estimates for the coming\nyears revenue on the basis of our assigned quota.\"\n REUTER')
crudeText <- c(crudeText, 'Saudi Arabian Oil Minister Hisham Nazer\nreiterated the kingdoms commitment to last Decembers OPEC\naccord to boost world oil prices and stabilise the market, the\nofficial Saudi Press Agency SPA said.\n    Asked by the agency about the recent fall in free market\noil prices, Nazer said Saudi Arabia \"is fully adhering by the\n... Accord and it will never sell its oil at prices below the\npronounced prices under any circumstance.\"\n    Nazer, quoted by SPA, said recent pressure on free market\nprices \"may be because of the end of the (northern hemisphere)\nwinter season and the glut in the market.\"\n    Saudi Arabia was a main architect of the December accord,\nunder which OPEC agreed to lower its total output ceiling by\n7.25 pct to 15.8 mln barrels per day (bpd) and return to fixed\nprices of around 18 dlrs a barrel.\n    The agreement followed a year of turmoil on oil markets,\nwhich saw prices slump briefly to under 10 dlrs a barrel in\nmid-1986 from about 30 dlrs in late 1985. 
Free market prices\nare currently just over 16 dlrs.\n    Nazer was quoted by the SPA as saying Saudi Arabias\nadherence to the accord was shown clearly in the oil market.\n    He said contacts among members of OPEC showed they all\nwanted to stick to the accord.\n    In Jamaica, OPEC President Rilwanu Lukman, who is also\nNigerian Oil Minister, said the group planned to stick with the\npricing agreement.\n    \"We are aware of the negative forces trying to manipulate\nthe operations of the market, but we are satisfied that the\nfundamentals exist for stable market conditions,\" he said.\n    Kuwaits Oil Minister, Sheikh Ali al-Khalifa al-Sabah, said\nin remarks published in the emirates daily Al-Qabas there were\nno plans for an emergency OPEC meeting to review prices.\n    Traders and analysts in international oil markets estimate\nOPEC is producing up to one mln bpd above the 15.8 mln ceiling.\n    They named Kuwait and the United Arab Emirates, along with\nthe much smaller producer Ecuador, among those producing above\nquota. Sheikh Ali denied that Kuwait was over-producing.\n REUTER')
crudeText <- c(crudeText, 'Saudi crude oil output last month fell\nto an average of 3.5 mln barrels per day (bpd) from 3.8 mln bpd\nin January, Gulf oil sources said.\n    They said exports from the Ras Tanurah and Juaymah\nterminals in the Gulf fell to an average 1.9 mln bpd last month\nfrom 2.2 mln in January because of lower liftings by some\ncustomers.\n    But the drop was much smaller than expected after Gulf\nexports rallied in the fourth week of February to 2.5 mln bpd\nfrom 1.2 mln in the third week, the sources said.\n    The production figures include neutral zone output but not\nsales from floating storage, which are generally considered\npart of a countrys output for Opec purposes.\n    Saudi Arabia has an Opec quota of 4.133 mln bpd under a\nproduction restraint scheme approved by the 13-nation group\nlast December to back new official oil prices averaging 18 dlrs\na barrel.\n    The sources said the two-fold jump in exports last week\nappeared to be the result of buyers rushing to lift February\nentitlements before the month-end.\n    Last weeks high export levels appeared to show continued\nsupport for official Opec prices from Saudi Arabias main crude\ncustomers, the four ex-partners of Aramco, the sources said.\n    The four -- Exxon Corp <XON>, Mobil Corp <MOB>, Texaco Inc\n<TX> and Chevron Corp <CHV> -- signed a long-term agreement\nlast month to buy Saudi crude for 17.52 dlrs a barrel.\n    However the sources said the real test of Saudi Arabias\nability to sell crude at official prices in a weak market will\ncome this month, when demand for petroleum products\ntraditionally tapers off. 
Spot prices have fallen in recent\nweeks to more than one dlr below Opec levels.\n    Saudi Arabian oil minister Hisham Nazer yesterday\nreiterated the kingdoms commitment to the December OPEC accord\nand said it would never sell below official prices.\n    The sources said total Saudi refinery throughput fell\nslightly in February to an average 1.1 mln bpd from 1.2 mln in\nJanuary because of cuts at the Yanbu and Jubail export\nrefineries.\n    They put crude oil exports through Yanbu at 100,000 bpd\nlast month, compared to zero in January, while throughput at\nBahrains refinery and neutral zone production remained steady\nat around 200,000 bpd each.\n REUTER')
crudeText <- c(crudeText, 'Deputy oil ministers from six Gulf\nArab states will meet in Bahrain today to discuss coordination\nof crude oil marketing, the official Emirates news agency WAM\nreported.\n    WAM said the officials would be discussing implementation\nof last Sundays agreement in Doha by Gulf Cooperation Council\n(GCC) oil ministers to help each other market their crude oil.\n    Four of the GCC states - Saudi Arabia, the United Arab\nEmirates (UAE), Kuwait and Qatar - are members of the\nOrganiaation of Petroleum Exporting Countries (OPEC) and some\nface stiff buyer resistance to official OPEC prices.\n Reuter')
crudeText <- c(crudeText, 'Saudi Arabian Oil Minister Hisham Nazer\nreiterated the kingdoms commitment to last Decembers OPEC\naccord to boost world oil prices and stabilize the market, the\nofficial Saudi Press Agency SPA said.\n    Asked by the agency about the recent fall in free market\noil prices, Nazer said Saudi Arabia \"is fully adhering by the\n... accord and it will never sell its oil at prices below the\npronounced prices under any circumstance.\"\n    Saudi Arabia was a main architect of December pact under\nwhich OPEC agreed to cut its total oil output ceiling by 7.25\npct and return to fixed prices of around 18 dollars a barrel.\n Reuter')
crudeText <- c(crudeText, 'Kuwaits oil minister said in a newspaper\ninterview that there were no plans for an emergency OPEC\nmeeting after the recent weakness in world oil prices.\n    Sheikh Ali al-Khalifa al-Sabah was quoted by the local\ndaily al-Qabas as saying that \"none of the OPEC members has\nasked for such a meeting.\"\n    He also denied that Kuwait was pumping above its OPEC quota\nof 948,000 barrels of crude daily (bpd).\n    Crude oil prices fell sharply last week as international\noil traders and analysts estimated the 13-nation OPEC was\npumping up to one million bpd over its self-imposed limits.\n Reuter')
crudeText <- c(crudeText, 'The port of Philadelphia was closed\nwhen a Cypriot oil tanker, Seapride II, ran aground after\nhitting a 200-foot tower supporting power lines across the\nriver, a Coast Guard spokesman said.\n    He said there was no oil spill but the ship is lodged on\nrocks opposite the Hope Creek nuclear power plant in New\nJersey.\n    He said the port would be closed until today when they\nhoped to refloat the ship on the high tide.\n    After delivering oil to a refinery in Paulsboro, New\nJersey, the ship apparently lost its steering and hit the power\ntransmission line carrying power from the nuclear plant to the\nstate of Delaware.\n Reuter')
crudeText <- c(crudeText, 'A study group said the United States\nshould increase its strategic petroleum reserve to one mln\nbarrels as one way to deal with the present and future impact\nof low oil prices on the domestic oil industry.\n    U.S. policy now is to raise the strategic reserve to 750\nmln barrels, from its present 500 mln, to help protect the\neconomy from an overseas embargo or a sharp price rise.\n    The Aspen Institute for Humanistic Studies, a private\ngroup, also called for new research for oil exploration and\ndevelopment techniques.\n    It predicted prices would remain at about 15-18 dlrs a\nbarrel for several years and then rise to the mid 20s, with\nimports at about 30 pct of U.S. consumption.\n    It said instead that such moves as increasing oil reserves\nand more exploration and development research would help to\nguard against or mitigate the risks of increased imports.\n Reuter')
crudeText <- c(crudeText, 'A study group said the United States\nshould increase its strategic petroleum reserve to one mln\nbarrels as one way to deal with the present and future impact\nof low oil prices on the domestic oil industry.\n    U.S. policy now is to raise the strategic reserve to 750\nmln barrels, from its present 500 mln, to help protect the\neconomy from an overseas embargo or a sharp price rise.\n    The Aspen Institute for Humanistic Studies, a private\ngroup, also called for new research for oil exploration and\ndevelopment techniques.\n    It predicted prices would remain at about 15-18 dlrs a\nbarrel for several years and then rise to the mid 20s, with\nimports at about 30 pct of U.S. consumption.\n    The study cited two basic policy paths for the nation: to\nprotect the U.S. industry through an import fee or other such\ndevice or to accept the full economic benefits of cheap oil.\n    But the group did not strongly back either option, saying\nthere were benefits and drawbacks to both.\n    It said instead that such moves as increasing oil reserves\nand more exploration and development research would help to\nguard against or mitigate the risks of increased imports.\n Reuter')
crudeText <- c(crudeText, 'Unocal Corps Union Oil Co said it\nlowered its posted prices for crude oil one to 1.50 dlrs a\nbarrel in the eastern region of the U.S., effective Feb 26.\n    Union said a 1.50 dlrs cut brings its posted price for the\nU.S. benchmark grade, West Texas Intermediate, to 16 dlrs.\nLouisiana Sweet also was lowered 1.50 dlrs to 16.35 dlrs, the\ncompany said.\n    No changes were made in Unions posted prices for West\nCoast grades of crude oil, the company said.\n Reuter')
crudeText <- c(crudeText, 'The New York Mercantile Exchange set\nApril one for the debut of a new procedure in the energy\ncomplex that will increase the use of energy futures worldwide.\n     On April one, NYMEX will allow oil traders that do not\nhold a futures position to initiate, after the exchange closes,\na transaction that can subsequently be hedged in the futures\nmarket, according to an exchange spokeswoman.\n    \"This will change the way oil is transacted in the real\nworld,\" said said Thomas McKiernan, McKiernan and Co chairman.\n    Foreign traders will be able to hedge trades against NYMEX\nprices before the exchange opens and negotiate prices at a\ndifferential to NYMEX prices, McKiernan explained.\n     The expanded program \"will serve the industry because the\noil market does not close when NYMEX does,\" said Frank Capozza,\nsecretary of Century Resources Inc.\n     The rule change, which has already taken effect for\nplatinum futures on NYMEX, is expected to increase the open\ninterest and liquidity in U.S. energy futures, according to\ntraders and analysts.\n    Currently, at least one trader in this transaction, called\nan exchange for physical or EFP, must hold a futures position\nbefore entering into the transaction.\n    Under the new arrangement, neither party has to hold a\nfutures position before entering into an EFP and one or both\nparties can offset their cash transaction with a futures\ncontract the next day, according to exchange officials.\n    When NYMEX announced its proposed rule change in December,\nNYMEX President Rosemary McFadden, said, \"Expansion of the EFP\nprovision will add to globalization of the energy markets by\nproviding for, in effect, 24-hour trading.\"\n    The Commodity Futures Trading Commission approved the rule\nchange in February, according to a CFTC spokeswoman.\n Reuter')
crudeText <- c(crudeText, 'Argentine crude oil production was\ndown 10.8 pct in January 1987 to 12.32 mln barrels, from 13.81\nmln barrels in January 1986, Yacimientos Petroliferos Fiscales\nsaid.\n    January 1987 natural gas output totalled 1.15 billion cubic\nmetrers, 3.6 pct higher than 1.11 billion cubic metres produced\nin January 1986, Yacimientos Petroliferos Fiscales added.\n Reuter')


# Build an in-memory (volatile) tm corpus, one document per article string
crude <- tm::VCorpus(tm::VectorSource(crudeText))
# Attach the original Reuters document IDs as document-level metadata
NLP::meta(crude, "id") <- c('127', '144', '191', '194', '211', '236', '237', '242', '246', '248', '273', '349', '352', '353', '368', '489', '502', '543', '704', '708')

# Print out the corpus
print(crude)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 1
## Content:  documents: 20
# Print the content of the 10th article
crude[[10]]$content
## [1] "Saudi Arabian Oil Minister Hisham Nazer\nreiterated the kingdoms commitment to last Decembers OPEC\naccord to boost world oil prices and stabilise the market, the\nofficial Saudi Press Agency SPA said.\n    Asked by the agency about the recent fall in free market\noil prices, Nazer said Saudi Arabia \"is fully adhering by the\n... Accord and it will never sell its oil at prices below the\npronounced prices under any circumstance.\"\n    Nazer, quoted by SPA, said recent pressure on free market\nprices \"may be because of the end of the (northern hemisphere)\nwinter season and the glut in the market.\"\n    Saudi Arabia was a main architect of the December accord,\nunder which OPEC agreed to lower its total output ceiling by\n7.25 pct to 15.8 mln barrels per day (bpd) and return to fixed\nprices of around 18 dlrs a barrel.\n    The agreement followed a year of turmoil on oil markets,\nwhich saw prices slump briefly to under 10 dlrs a barrel in\nmid-1986 from about 30 dlrs in late 1985. 
Free market prices\nare currently just over 16 dlrs.\n    Nazer was quoted by the SPA as saying Saudi Arabias\nadherence to the accord was shown clearly in the oil market.\n    He said contacts among members of OPEC showed they all\nwanted to stick to the accord.\n    In Jamaica, OPEC President Rilwanu Lukman, who is also\nNigerian Oil Minister, said the group planned to stick with the\npricing agreement.\n    \"We are aware of the negative forces trying to manipulate\nthe operations of the market, but we are satisfied that the\nfundamentals exist for stable market conditions,\" he said.\n    Kuwaits Oil Minister, Sheikh Ali al-Khalifa al-Sabah, said\nin remarks published in the emirates daily Al-Qabas there were\nno plans for an emergency OPEC meeting to review prices.\n    Traders and analysts in international oil markets estimate\nOPEC is producing up to one mln bpd above the 15.8 mln ceiling.\n    They named Kuwait and the United Arab Emirates, along with\nthe much smaller producer Ecuador, among those producing above\nquota. Sheikh Ali denied that Kuwait was over-producing.\n REUTER"
# Find the first ID
crude[[1]]$meta$id
## [1] "1"
# Make a vector of document IDs.
# vapply() replaces the original grow-in-a-loop append(), which copies the
# whole vector on every iteration; seq_along(crude) also generalizes the
# hard-coded 1:20 to a corpus of any length. The result is the same
# character vector of per-document "id" metadata values.
ids <- vapply(seq_along(crude), function(i) crude[[i]]$meta$id, character(1))


# Create a tibble (one row per document, text in the `text` column) & review
crude_tibble <- generics::tidy(crude)
names(crude_tibble)
## [1] "author"        "datetimestamp" "description"   "heading"      
## [5] "id"            "language"      "origin"        "text"
crude_counts <- crude_tibble %>%
    # Tokenize the article text into one row per word
    tidytext::unnest_tokens(word, text) %>%
    # Count by word, most frequent first
    count(word, sort = TRUE) %>%
    # Remove stop words ("the", "of", ...) so content words rank first;
    # anti_join preserves the sorted row order of the left-hand table
    anti_join(tidytext::stop_words)
## Joining, by = "word"
# Assign the top word: the first row of the sorted, stop-word-free counts.
# Computed from the data rather than hard-coded (the original assigned the
# literal "oil", which is what this evaluates to for this corpus).
top_word <- crude_counts$word[1]


russian_tweets <- read_csv("./RInputFiles/russian_1.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   X1 = col_double(),
##   external_author_id = col_double(),
##   following = col_double(),
##   followers = col_double(),
##   updates = col_double(),
##   retweet = col_double(),
##   new_june_2018 = col_double(),
##   alt_external_id = col_double(),
##   tweet_id = col_double(),
##   tco3_step1 = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 29 parsing failures.
##  row        col           expected                                             actual                          file
## 4526 tco3_step1 1/0/T/F/TRUE/FALSE http://StopMassIncarceration.net                   './RInputFiles/russian_1.csv'
## 5281 tco3_step1 1/0/T/F/TRUE/FALSE https://www.youtube.com/watch?v=RZS59mXnKSo        './RInputFiles/russian_1.csv'
## 5703 tco3_step1 1/0/T/F/TRUE/FALSE https://twitter.com/intent/user?user_id=4352458761 './RInputFiles/russian_1.csv'
## 5763 tco3_step1 1/0/T/F/TRUE/FALSE https://goo.gl/jfulXo                              './RInputFiles/russian_1.csv'
## 6089 tco3_step1 1/0/T/F/TRUE/FALSE https://youtu.be/gQM8Bql4IpI                       './RInputFiles/russian_1.csv'
## .... .......... .................. .................................................. .............................
## See problems(...) for more details.
str(russian_tweets)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 20000 obs. of  22 variables:
##  $ X1                : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ external_author_id: num  9.06e+17 9.06e+17 9.06e+17 9.06e+17 9.06e+17 ...
##  $ author            : chr  "10_GOP" "10_GOP" "10_GOP" "10_GOP" ...
##  $ content           : chr  "\"We have a sitting Democrat US Senator on trial for corruption and you've barely heard a peep from the mainstr"| __truncated__ "Marshawn Lynch arrives to game in anti-Trump shirt. Judging by his sagging pants the shirt should say Lynch vs."| __truncated__ "Daughter of fallen Navy Sailor delivers powerful monologue on anthem protests, burns her NFL packers gear.  #Bo"| __truncated__ "JUST IN: President Trump dedicates Presidents Cup golf tournament trophy to the people of Florida, Texas and Pu"| __truncated__ ...
##  $ region            : chr  "Unknown" "Unknown" "Unknown" "Unknown" ...
##  $ language          : chr  "English" "English" "English" "English" ...
##  $ publish_date      : chr  "10/1/2017 19:58" "10/1/2017 22:43" "10/1/2017 22:50" "10/1/2017 23:52" ...
##  $ harvested_date    : chr  "10/1/2017 19:59" "10/1/2017 22:43" "10/1/2017 22:51" "10/1/2017 23:52" ...
##  $ following         : num  1052 1054 1054 1062 1050 ...
##  $ followers         : num  9636 9637 9637 9642 9645 ...
##  $ updates           : num  253 254 255 256 246 247 248 249 250 251 ...
##  $ post_type         : chr  NA NA "RETWEET" NA ...
##  $ account_type      : chr  "Right" "Right" "Right" "Right" ...
##  $ retweet           : num  0 0 1 0 1 0 1 0 0 0 ...
##  $ account_category  : chr  "RightTroll" "RightTroll" "RightTroll" "RightTroll" ...
##  $ new_june_2018     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ alt_external_id   : num  9.06e+17 9.06e+17 9.06e+17 9.06e+17 9.06e+17 ...
##  $ tweet_id          : num  9.15e+17 9.15e+17 9.15e+17 9.15e+17 9.14e+17 ...
##  $ article_url       : chr  "http://twitter.com/905874659358453760/statuses/914580356430536707" "http://twitter.com/905874659358453760/statuses/914621840496189440" "http://twitter.com/905874659358453760/statuses/914623490375979008" "http://twitter.com/905874659358453760/statuses/914639143690555392" ...
##  $ tco1_step1        : chr  "https://twitter.com/10_gop/status/914580356430536707/video/1" "https://twitter.com/damienwoody/status/914568524449959937/video/1" "https://twitter.com/10_gop/status/913231923715198976/video/1" "https://twitter.com/10_gop/status/914639143690555392/video/1" ...
##  $ tco2_step1        : chr  NA NA NA NA ...
##  $ tco3_step1        : logi  NA NA NA NA NA NA ...
##  - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 29 obs. of  5 variables:
##   ..$ row     : int  4526 5281 5703 5763 6089 6098 6119 6238 6903 7516 ...
##   ..$ col     : chr  "tco3_step1" "tco3_step1" "tco3_step1" "tco3_step1" ...
##   ..$ expected: chr  "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" ...
##   ..$ actual  : chr  "http://StopMassIncarceration.net" "https://www.youtube.com/watch?v=RZS59mXnKSo" "https://twitter.com/intent/user?user_id=4352458761" "https://goo.gl/jfulXo" ...
##   ..$ file    : chr  "'./RInputFiles/russian_1.csv'" "'./RInputFiles/russian_1.csv'" "'./RInputFiles/russian_1.csv'" "'./RInputFiles/russian_1.csv'" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   X1 = col_double(),
##   ..   external_author_id = col_double(),
##   ..   author = col_character(),
##   ..   content = col_character(),
##   ..   region = col_character(),
##   ..   language = col_character(),
##   ..   publish_date = col_character(),
##   ..   harvested_date = col_character(),
##   ..   following = col_double(),
##   ..   followers = col_double(),
##   ..   updates = col_double(),
##   ..   post_type = col_character(),
##   ..   account_type = col_character(),
##   ..   retweet = col_double(),
##   ..   account_category = col_character(),
##   ..   new_june_2018 = col_double(),
##   ..   alt_external_id = col_double(),
##   ..   tweet_id = col_double(),
##   ..   article_url = col_character(),
##   ..   tco1_step1 = col_character(),
##   ..   tco2_step1 = col_character(),
##   ..   tco3_step1 = col_logical()
##   .. )
# Create a corpus, one document per tweet's text
tweet_corpus <- tm::VCorpus(tm::VectorSource(russian_tweets$content))

# Attach following and followers counts as document-level metadata
# (vectors are recycled element-wise across the documents)
NLP::meta(tweet_corpus, 'following') <- russian_tweets$following
NLP::meta(tweet_corpus, 'followers') <- russian_tweets$followers

# Review the meta data
head(NLP::meta(tweet_corpus))
##   following followers
## 1      1052      9636
## 2      1054      9637
## 3      1054      9637
## 4      1062      9642
## 5      1050      9645
## 6      1050      9644
# Count occurrence by article (id) and word
words <- crude_tibble %>%
    tidytext::unnest_tokens(output = "word", token = "words", input = text) %>%
    anti_join(tidytext::stop_words) %>%
    count(id, word, sort=TRUE)
## Joining, by = "word"
# How many different word/article combinations are there?
unique_combinations <- nrow(words)

# Filter to the articles containing the word "prices"
words %>%
    filter(word == "prices")
## # A tibble: 15 x 3
##    id    word       n
##    <chr> <chr>  <int>
##  1 10    prices     9
##  2 11    prices     5
##  3 13    prices     5
##  4 2     prices     5
##  5 6     prices     5
##  6 1     prices     3
##  7 19    prices     3
##  8 14    prices     2
##  9 16    prices     2
## 10 17    prices     2
## 11 18    prices     2
## 12 8     prices     2
## 13 12    prices     1
## 14 7     prices     1
## 15 9     prices     1
# How many articles had the word "prices"?
# Computed from the data rather than hard-coded; the filter printed above
# shows 15 rows, which is what this evaluates to for this corpus.
number_of_price_articles <- words %>%
    filter(word == "prices") %>%
    nrow()


# Tokenize and remove stop words
tidy_tweets <- russian_tweets %>%
    tidytext::unnest_tokens(word, content) %>%
    anti_join(tidytext::stop_words)
## Joining, by = "word"
# Count by word
unique_words <- tidy_tweets %>%
    count(word)

# Count by tweet (tweet_id) and word
unique_words_by_tweet <- tidy_tweets %>%
    count(tweet_id, word)

# Find the size of matrix: rows x columns
# (every tweet crossed with every distinct token = full DTM cell count)
size <- nrow(russian_tweets) * length(unique(tidy_tweets$word))

# Fraction of the full document-term matrix that would actually be
# non-empty -- illustrates how sparse such matrices are (~0.02% here)
percent <- nrow(unique_words_by_tweet) / size
percent
## [1] 0.0002028352
# Create a tibble with TFIDF values.
# Note bind_tf_idf()'s argument order: (term, document, n) -- here the
# term is `word` and the document identifier is `id`.
crude_weights <- crude_tibble %>%
    tidytext::unnest_tokens(output = "word", token = "words", input = text) %>%
    anti_join(tidytext::stop_words) %>%
    count(id, word) %>%
    tidytext::bind_tf_idf(word, id, n)
## Joining, by = "word"
# Find the highest TFIDF values (words most distinctive of one article)
crude_weights %>%
    arrange(desc(tf_idf))
## # A tibble: 1,494 x 6
##    id    word         n     tf   idf tf_idf
##    <chr> <chr>    <int>  <dbl> <dbl>  <dbl>
##  1 20    january      4 0.0930  2.30  0.214
##  2 15    power        4 0.0690  3.00  0.207
##  3 19    futures      9 0.0643  3.00  0.193
##  4 8     8            6 0.0619  3.00  0.185
##  5 3     canada       2 0.0526  3.00  0.158
##  6 3     canadian     2 0.0526  3.00  0.158
##  7 15    ship         3 0.0517  3.00  0.155
##  8 19    nymex        7 0.05    3.00  0.150
##  9 20    cubic        2 0.0465  3.00  0.139
## 10 20    fiscales     2 0.0465  3.00  0.139
## # ... with 1,484 more rows
# Find the lowest non-zero TFIDF values
crude_weights %>%
    filter(tf_idf != 0) %>%
    arrange(tf_idf)
## # A tibble: 1,454 x 6
##    id    word          n      tf   idf  tf_idf
##    <chr> <chr>     <int>   <dbl> <dbl>   <dbl>
##  1 7     prices        1 0.00452 0.288 0.00130
##  2 9     prices        1 0.00521 0.288 0.00150
##  3 7     dlrs          1 0.00452 0.598 0.00271
##  4 7     opec          1 0.00452 0.693 0.00314
##  5 9     opec          1 0.00521 0.693 0.00361
##  6 7     mln           1 0.00452 0.799 0.00361
##  7 7     petroleum     1 0.00452 0.799 0.00361
##  8 11    petroleum     1 0.00455 0.799 0.00363
##  9 6     barrels       1 0.00429 0.916 0.00393
## 10 6     industry      1 0.00429 0.916 0.00393
## # ... with 1,444 more rows
# Create word counts per chapter (stop words deliberately retained here)
animal_farm_counts <- animal_farm %>%
    tidytext::unnest_tokens(word, text_column) %>%
    count(chapter, word)

# Calculate the pairwise cosine similarity between chapters,
# using raw word counts as the feature values
comparisons <- animal_farm_counts %>%
    widyr::pairwise_similarity(chapter, word, n) %>%
    arrange(desc(similarity))

# Print the mean of the similarity values
comparisons %>%
    summarize(mean = mean(similarity))  # very high similarities due to stop words
## # A tibble: 1 x 1
##    mean
##   <dbl>
## 1 0.949
# Create word counts with TF-IDF weights, stop words removed
animal_farm_counts <- animal_farm %>%
    tidytext::unnest_tokens(word, text_column) %>%
    anti_join(tidytext::stop_words) %>%
    count(chapter, word) %>%
    # FIX: bind_tf_idf() takes (term, document, n) in that order. The
    # original passed (chapter, word, n) -- term and document swapped --
    # which produces meaningless tf_idf values. This now matches the
    # correct usage earlier in the file: bind_tf_idf(word, id, n).
    # (The tf_idf-based similarity printed further below was computed from
    # the swapped version and will differ after this fix.)
    tidytext::bind_tf_idf(word, chapter, n)
## Joining, by = "word"
# Calculate cosine similarity on raw word counts (unaffected by the fix,
# since it uses the `n` column)
animal_farm_counts %>%
    widyr::pairwise_similarity(chapter, word, n) %>%
    arrange(desc(similarity))
## # A tibble: 90 x 3
##    item1      item2      similarity
##    <chr>      <chr>           <dbl>
##  1 Chapter 8  Chapter 7       0.696
##  2 Chapter 7  Chapter 8       0.696
##  3 Chapter 7  Chapter 5       0.693
##  4 Chapter 5  Chapter 7       0.693
##  5 Chapter 8  Chapter 5       0.642
##  6 Chapter 5  Chapter 8       0.642
##  7 Chapter 7  Chapter 6       0.641
##  8 Chapter 6  Chapter 7       0.641
##  9 Chapter 6  Chapter 10      0.638
## 10 Chapter 10 Chapter 6       0.638
## # ... with 80 more rows
# Calculate cosine similarity using tf_idf values
animal_farm_counts %>%
    widyr::pairwise_similarity(chapter, word, tf_idf) %>%
    arrange(desc(similarity))
## # A tibble: 90 x 3
##    item1      item2      similarity
##    <chr>      <chr>           <dbl>
##  1 Chapter 8  Chapter 7      0.0580
##  2 Chapter 7  Chapter 8      0.0580
##  3 Chapter 9  Chapter 8      0.0525
##  4 Chapter 8  Chapter 9      0.0525
##  5 Chapter 7  Chapter 5      0.0467
##  6 Chapter 5  Chapter 7      0.0467
##  7 Chapter 9  Chapter 10     0.0446
##  8 Chapter 10 Chapter 9      0.0446
##  9 Chapter 9  Chapter 7      0.0432
## 10 Chapter 7  Chapter 9      0.0432
## # ... with 80 more rows

Chapter 3 - Applications: Classification and Topic Modeling

Preparing Text for Modeling:

  • Two common text analysis techniques include text classification and topic modeling
  • Classification modeling is a type of supervised learning - separation into unique categories
    • Example of classifying sentences about Napoleon vs. Boxer
    • sentences <- animal_farm %>% unnest_tokens(output=“sentence”, token=“sentences”, input=text_column)
    • sentences$boxer <- grepl("boxer", sentences$sentence)
    • sentences$napoleon <- grepl("napoleon", sentences$sentence)
    • sentences$sentence <- gsub("boxer", "animal X", sentences$sentence)
    • sentences$sentence <- gsub("napoleon", "animal X", sentences$sentence)
    • animal_sentences <- sentences[(sentences$boxer + sentences$napoleon) == 1, ]
    • animal_sentences$Name <- as.factor(ifelse(animal_sentences$boxer, “boxer”, “napoleon”))
    • animal_sentences <- rbind(animal_sentences[animal_sentences$Name == “boxer”, ][1:75, ], animal_sentences[animal_sentences$Name == “napoleon”, ][1:75, ])
    • animal_sentences$sentence_id <- c(1:dim(animal_sentences)[1])
  • With data prepared, can attempt to predict which sentence refers to each of the animals
    • animal_tokens <- animal_sentences %>% unnest_tokens(output=“word”, token=“words”, input=sentence) %>% anti_join(stop_words) %>% mutate(word=wordStem(word))
    • animal_matrix <- animal_tokens %>% count(sentence_id, word) %>% tidytext::cast_dtm(document=sentence_id, term=word, value=n, weighting=tm::weightTfIdf)
    • removeSparseTerms(animal_matrix, sparse=0.90) # reduces to 66% sparsity from 99%+ sparsity
    • Decisions on optimal amount of sparsity depend on computing power as well as number of terms needed for good classification

Classification Modeling:

  • Can split data in many ways, including use of sample()
    • set.seed(1111)
    • sample_size <- floor(0.8 * nrow(animal_matrix))
    • train_ind <- sample(nrow(animal_matrix), size=sample_size)
    • train <- animal_matrix[train_ind, ]
    • test <- animal_matrix[-train_ind, ]
  • Can use the random forest classification techniques
    • rfc <- randomForest::randomForest(x=as.data.frame(as.matrix(train)), y=animal_sentences$Name[train_ind], ntree=50)
    • y_pred <- predict(rfc, newdata=as.data.frame(as.matrix(test)))
    • table(animal_sentences$Name[-train_ind], y_pred) # confusion matrix

Introduction to Topic Modeling:

  • Text is typically made up of a collection of topics
    • Documents are a mix of topics
    • Topics are a mix of words
  • Can prepare for LDA to assist in topic modeling
    • animal_farm_tokens <- animal_farm %>% unnest_tokens(output=“word”, token=“words”, input=text_column) %>% anti_join(stop_words) %>% mutate(word=wordStem(word))
    • animal_farm_matrix <- animal_farm_tokens %>% count(chapter, word) %>% cast_dtm(document=chapter, term=word, value=n, weighting=tm::weightTf) # LDA requires tf rather than tf-idf
    • animal_farm_lda <- LDA(train, k=4, method=“Gibbs”, control=list(seed=1111))
    • animal_farm_betas <- tidy(animal_farm_lda, matrix=“beta”) # beta is a per-topic metric (words more related to a single topic should have a higher beta)
  • Can then label documents as topics
    • animal_farm_chapters <- tidy(animal_farm_lda, matrix=“gamma”) # gamma represents how much each chapter is made up of a topic
    • animal_farm_chapters %>% filter(document == “Chapter 1”)

LDA in Practice:

  • Need to select the number of topics as an input to LDA - can use perplexity as a metric
  • Perplexity is a measure of how well a model fits new data - lower is better
    • Start with the standard test/train data splitting
    • Can create an lda_model for many different k and store the resulting perplexity(lda_model, newdata=test)
    • Can plot and then look for elbows or flattening points where the perplexity is no longer decreasing
  • Practical considerations are important also - sometimes fewer topics are easier to communicate and comprehend, even at the sacrifice of some excess perplexity
    • Common to have a SME review some of the top articles and words, and then to provide a theme for each topic
  • Can run summaries of the output
    • gammas <- tidy(lda_model, matrix=“gamma”)
    • gammas %>% group_by(document) %>% arrange(desc(gamma)) %>% slice(1) %>% group_by(topic) %>% tally(topic, sort=TRUE)
    • gammas %>% group_by(document) %>% arrange(desc(gamma)) %>% slice(1) %>% group_by(topic) %>% summarize(avg=mean(gamma)) %>% arrange(desc(avg))

Example code includes:

# Stem the tokens so inflected forms ("vote"/"votes"/"voting") collapse
russian_tokens <- russian_tweets %>%
    tidytext::unnest_tokens(output = "word", token = "words", input = content) %>%
    anti_join(tidytext::stop_words) %>%
    mutate(word = SnowballC::wordStem(word))
## Joining, by = "word"
# Create a document term matrix, one row per tweet, TF-IDF weighted
tweet_matrix <- russian_tokens %>%
    count(tweet_id, word) %>%
    tidytext::cast_dtm(document = tweet_id, term = word, value = n, weighting = tm::weightTfIdf)

# Print the matrix details (dimensions, sparsity, weighting)
tweet_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 38909)>>
## Non-/sparse entries: 176707/777458567
## Sparsity           : 100%
## Maximal term length: 37
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
less_sparse_matrix <- tm::removeSparseTerms(tweet_matrix, sparse = 0.5)

# Print results
tweet_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 38909)>>
## Non-/sparse entries: 176707/777458567
## Sparsity           : 100%
## Maximal term length: 37
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
less_sparse_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 2)>>
## Non-/sparse entries: 27527/12445
## Sparsity           : 31%
## Maximal term length: 4
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
less_sparse_matrix <- tm::removeSparseTerms(tweet_matrix, sparse = 0.9)

# Print results
tweet_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 38909)>>
## Non-/sparse entries: 176707/777458567
## Sparsity           : 100%
## Maximal term length: 37
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
less_sparse_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 2)>>
## Non-/sparse entries: 27527/12445
## Sparsity           : 31%
## Maximal term length: 4
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
less_sparse_matrix <- tm::removeSparseTerms(tweet_matrix, sparse = 0.99)

# Print results
tweet_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 38909)>>
## Non-/sparse entries: 176707/777458567
## Sparsity           : 100%
## Maximal term length: 37
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
less_sparse_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 56)>>
## Non-/sparse entries: 48853/1070363
## Sparsity           : 96%
## Maximal term length: 14
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
less_sparse_matrix <- tm::removeSparseTerms(tweet_matrix, sparse =0.9999)

# Print results
tweet_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 38909)>>
## Non-/sparse entries: 176707/777458567
## Sparsity           : 100%
## Maximal term length: 37
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
less_sparse_matrix
## <<DocumentTermMatrix (documents: 19986, terms: 9566)>>
## Non-/sparse entries: 147364/191038712
## Sparsity           : 100%
## Maximal term length: 27
## Weighting          : term frequency - inverse document frequency (normalized) (tf-idf)
# Seed so the two sample_n() draws and the shuffle below are reproducible
set.seed(2001021530)

# Sample 2,000 tweets from each of the "Right" and "Left" account types
rightTweet <- russian_tweets %>%
    filter(account_type == "Right") %>%
    sample_n(2000)

leftTweet <- russian_tweets %>%
    filter(account_type == "Left") %>%
    sample_n(2000)

# Shuffle the combined rows so Right/Left tweets are interleaved at random.
# nrow(combined) generalizes the original hard-coded 4000 (= 2 * 2000);
# with the same RNG state, sample(n) draws the same permutation as the
# original sample(1:4000, 4000, replace = FALSE).
combined <- rbind(rightTweet, leftTweet)
idx <- sample(nrow(combined))

leftRightData <- combined[idx, ]


# Tokenize, remove stop words, and stem the sampled Left/Right tweets
leftRight_tokens <- leftRightData %>%
    tidytext::unnest_tokens(output = "word", token = "words", input = content) %>%
    anti_join(tidytext::stop_words) %>%
    mutate(word = SnowballC::wordStem(word))
## Joining, by = "word"
# Create a document term matrix, then drop sparse terms
# (sparse = 0.99 keeps only terms appearing in at least ~1% of documents)
left_right_matrix_small <- leftRight_tokens %>%
    count(tweet_id, word) %>%
    tidytext::cast_dtm(document = tweet_id, term = word, value = n, weighting = tm::weightTfIdf) %>%
    tm::removeSparseTerms(sparse = 0.99)

# Label each DTM row with its tweet's account_type ("Left"/"Right").
# match() on the tweet IDs replaces the original O(n^2) grow-in-a-loop
# filter(); the original loop compared the numeric tweet_id against the
# character rownames, which coerces via as.character(), so the same
# coercion is applied here. Assumes tweet_id uniquely identifies rows of
# leftRightData -- which holds here, since the DTM rownames were built
# from those same IDs.
dtm_ids <- rownames(as.matrix(left_right_matrix_small))
left_right_labels <- as.factor(
    leftRightData$account_type[match(dtm_ids, as.character(leftRightData$tweet_id))]
)


# Create train/test split (75% train), seeded for reproducibility
set.seed(1111)
sample_size <- floor(0.75 * nrow(left_right_matrix_small))
train_ind <- sample(nrow(left_right_matrix_small), size = sample_size)
train <- left_right_matrix_small[train_ind, ]
test <- left_right_matrix_small[-train_ind, ]

# Create a random forest classifier with 50 trees.
# FIX: the argument name is `ntree`, not `nTree`. `nTree` does not
# partially match `ntree` and is silently swallowed by `...`, so the
# original actually grew the default 500 trees -- the printed output
# below ("Number of trees: 500") confirms it was ignored.
rfc <- randomForest::randomForest(x = as.data.frame(as.matrix(train)), 
                                  y = left_right_labels[train_ind], ntree = 50
                                  )

# Print the results (OOB error estimate and confusion matrix)
rfc
## 
## Call:
##  randomForest(x = as.data.frame(as.matrix(train)), y = left_right_labels[train_ind],      nTree = 50) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 8
## 
##         OOB estimate of  error rate: 19.51%
## Confusion matrix:
##       Left Right class.error
## Left  1369   121  0.08120805
## Right  464  1044  0.30769231
# Percentage correctly labeled "Left"
# left <- (350) / (350 + 157)
# left

# Percentage correctly labeled "Right"
# right <- (436) / (436 + 57)
# right

# Overall Accuracy:
# accuracy <- (350 + 436) / (350 + 436 + 57 + 157)
# accuracy


# Keep only the Animal Farm sentences that mention "napoleon"
# (unnest_tokens lower-cases text, so the lower-case pattern matches)
napolSents <- animal_farm %>%
    tidytext::unnest_tokens(output = "sentences", input = text_column, token = "sentences") %>%
    mutate(sentence_id=row_number(), napoleon=str_detect(sentences, 'napoleon')) %>%
    filter(napoleon)

# Tokenize those sentences into words, drop stop words, and stem
pig_tokens <- napolSents %>%
    tidytext::unnest_tokens(output = "word", token = "words", input = sentences) %>%
    anti_join(tidytext::stop_words) %>%
    mutate(word = SnowballC::wordStem(word))
## Joining, by = "word"
# Create a document term matrix -- raw term frequency (weightTf), since
# LDA requires tf rather than tf-idf -- then drop very sparse terms
pig_matrix <- pig_tokens %>%
    count(sentence_id, word) %>%
    tidytext::cast_dtm(document = sentence_id, term = word, value = n, weighting = tm::weightTf) %>%
    tm::removeSparseTerms(sparse=0.995)

# Perform topic modeling: 10 topics via Gibbs sampling, seeded for
# reproducibility
sentence_lda <-
    topicmodels::LDA(pig_matrix, k = 10, method = 'Gibbs', control = list(seed = 1111))

# Extract the beta matrix (per-topic word weights: words more related to
# a single topic have a higher beta)
sentence_betas <- generics::tidy(sentence_lda, matrix = "beta")

# Topic #2: top-weighted words
sentence_betas %>%
    filter(topic == 2) %>%
    arrange(-beta)
## # A tibble: 859 x 3
##    topic term        beta
##    <int> <chr>      <dbl>
##  1     2 anim      0.0716
##  2     2 comrad    0.0678
##  3     2 windmil   0.0266
##  4     2 snowball' 0.0191
##  5     2 squealer  0.0191
##  6     2 cri       0.0154
##  7     2 forward   0.0154
##  8     2 walk      0.0116
##  9     2 hear      0.0116
## 10     2 command   0.0116
## # ... with 849 more rows
# Topic #3: top-weighted words
# (the original comment said "Topic #10", but the filter is topic == 3)
sentence_betas %>%
    filter(topic == 3) %>%
    arrange(-beta)
## # A tibble: 859 x 3
##    topic term         beta
##    <int> <chr>       <dbl>
##  1     3 napoleon' 0.0524 
##  2     3 usual     0.0221 
##  3     3 black     0.0178 
##  4     3 moment    0.0178 
##  5     3 pronounc  0.0178 
##  6     3 effort    0.0134 
##  7     3 gun       0.0134 
##  8     3 boar      0.00909
##  9     3 cast      0.00909
## 10     3 pig       0.00909
## # ... with 849 more rows
# Extract the beta (per-topic word weight) and gamma (per-document topic
# weight) matrices
sentence_betas <- generics::tidy(sentence_lda, matrix = "beta")
sentence_gammas <- generics::tidy(sentence_lda, matrix = "gamma")

# Explore Topic 5 Betas: its top-weighted words
sentence_betas %>%
    filter(topic == 5) %>%
    arrange(-beta)
## # A tibble: 859 x 3
##    topic term        beta
##    <int> <chr>      <dbl>
##  1     5 appear    0.0386
##  2     5 leader    0.0271
##  3     5 stood     0.0233
##  4     5 half      0.0157
##  5     5 agreement 0.0157
##  6     5 voic      0.0157
##  7     5 abolish   0.0118
##  8     5 time      0.0118
##  9     5 death     0.0118
## 10     5 drink     0.0118
## # ... with 849 more rows
# Explore Topic 5 Gammas: the sentences most weighted toward topic 5
sentence_gammas %>%
    filter(topic == 5) %>%
    arrange(-gamma)
## # A tibble: 157 x 3
##    document topic gamma
##    <chr>    <int> <dbl>
##  1 1074         5 0.157
##  2 954          5 0.153
##  3 1152         5 0.152
##  4 1370         5 0.151
##  5 225          5 0.149
##  6 1518         5 0.148
##  7 1171         5 0.147
##  8 1355         5 0.140
##  9 1521         5 0.135
## 10 968          5 0.133
## # ... with 147 more rows
# Print the top sentence for topic 5: take the document with the highest
# gamma within each topic, keep topic 5, and look its sentence up by id
napolSents %>%
    filter(sentence_id == (sentence_gammas %>%
                               group_by(topic) %>%
                               top_n(1, gamma) %>%
                               filter(topic == 5) %>%
                               pull(document) %>%
                               as.numeric())) %>%
    pull(sentences)
## [1] "then a sheep confessed to having urinated in the drinking pool--urged to do this, so she said, by snowball--and two other sheep confessed to having murdered an old ram, an especially devoted follower of napoleon, by chasing him round and round a bonfire when he was suffering from a cough."
# Tokenize, remove stop words, and stem the "Right" tweet sample
right_tokens <- rightTweet %>%
    tidytext::unnest_tokens(output = "word", token = "words", input = content) %>%
    anti_join(tidytext::stop_words) %>%
    mutate(word = SnowballC::wordStem(word))
## Joining, by = "word"
# Create a document term matrix with raw term frequency (weightTf),
# as required by the LDA fit below
right_matrix <- right_tokens %>%
    count(tweet_id, word) %>%
    tidytext::cast_dtm(document = tweet_id, term = word, value = n, weighting = tm::weightTf)


# Setup train and test data (90% train), seeded for reproducibility
sample_size <- floor(0.90 * nrow(right_matrix))
set.seed(1111)
train_ind <- sample(nrow(right_matrix), size = sample_size)
train <- right_matrix[train_ind, ]
test <- right_matrix[-train_ind, ]

# Perform topic modeling: 5 topics via Gibbs sampling
lda_model <- topicmodels::LDA(train, k = 5, method = "Gibbs",control = list(seed = 1111))

# Train perplexity (lower is better)
topicmodels::perplexity(lda_model, newdata = train) 
## [1] 596.7166
# Test perplexity -- higher than train, as expected on held-out data
topicmodels::perplexity(lda_model, newdata = test) 
## [1] 854.6082
# Extract the gamma (per-document topic weight) matrix 
gamma_values <- generics::tidy(sentence_lda, matrix = "gamma")

# For each document keep only its single highest-weighted topic,
# then regroup by that top topic
grouped_gammas <- gamma_values %>%
    group_by(document) %>%
    arrange(desc(gamma)) %>%
    slice(1) %>%
    group_by(topic)

# Count documents by top topic.
# FIX: the original called tally(topic, sort=TRUE), which passes `topic`
# as the `wt` (weight) argument, so each group's n was sum(topic) =
# topic * count rather than a count. The printed output below reflects
# that weighted version (e.g. topic 9: n = 99 = 9 * 11 documents; the
# implied counts sum to the 157 documents). An unweighted tally() gives
# the intended document counts.
grouped_gammas %>% 
    tally(sort = TRUE)
## # A tibble: 10 x 2
##    topic     n
##    <int> <int>
##  1     9    99
##  2     8    96
##  3     5    85
##  4    10    80
##  5     6    66
##  6     2    64
##  7     7    63
##  8     4    60
##  9     3    54
## 10     1    24
# Average topic weight for the top topic of each sentence
# (how dominant each topic is in the documents it "wins")
grouped_gammas %>% 
    summarize(avg=mean(gamma)) %>%
    arrange(desc(avg))
## # A tibble: 10 x 2
##    topic   avg
##    <int> <dbl>
##  1     9 0.144
##  2     7 0.137
##  3     1 0.136
##  4     8 0.135
##  5     5 0.135
##  6     6 0.135
##  7     4 0.133
##  8    10 0.132
##  9     2 0.130
## 10     3 0.130

Chapter 4 - Advanced Techniques

Sentiment Analysis:

  • Sentiment analysis extracts sentiments from the text, using a dictionary that scores words by sentiment (either a mapping or a numeric)
    • tidytext::sentiments # word-sentiment-lexicon-score
    • AFINN - scores from -5 (negative) to +5 (positive)
    • bing - simple boolean for negative/positive
    • nrc - labels words as fear, joy, anger, etc.
    • tidytext::get_sentiments(“nrc”)
  • Preparatory steps are merely to tokenize the data
    • animal_farm_tokens <- animal_farm %>% unnest_tokens(output=“word”, token=“words”, input=text_column) %>% anti_join(stop_words)
    • animal_farm_tokens %>% inner_join(get_sentiments(“afinn”)) %>% group_by(chapter) %>% summarize(sentiment=mean(score)) %>% arrange(sentiment)

Word Embeddings:

  • One of the goals is to get at word meanings - for example, “smartest” and “most brilliant” should be considered synonyms rather than different words
  • The word2vec represents words as a large vector space, where words that are similar lie close to one another (captures multiple similarities)
    • library(h2o)
    • h2o.init()
    • h2o_object <- as.h2o(animal_farm)
    • words <- h2o.tokenize(h2o_object$text_column, “\\W+”)
    • words <- h2o.tolower(words)
    • words <- words[is.na(words) || (!words %in% stop_words$word), ]
    • word2vec_model <- h2o.word2vec(words, min_word_freq=5, epochs=5) # 5 is a small number of epochs
    • h2o.findSynonyms(word2vec_model, “animal”)
    • h2o.findSynonyms(word2vec_model, “jones”)
  • Additional uses include classification modeling, sentiment analysis, topic modeling, and the like

Additional NLP Analysis:

  • BERT (bidirectional encoder representations from transformers) is a pre-trained model used in transfer learning for NLP
    • Requires only a small amount of labelled data for a specific supervised learning task
  • ERNIE (enhanced representation through knowledge integration) is also improving NLP analysis
  • NER (named entity recognition) attempts to classify names within text - extraction, recommendations, search algorithms
  • POS (part of speech) tagging - tag words with the proper part of speech (noun, verb, adjective, etc.)

Wrap Up:

  • Text analysis techniques
  • Preparing data for analysis, including appropriate formats
  • Common text analysis techniques
  • State-of-the-art techniques available today

Example code includes:

# Print the lexicon
tidytext::get_sentiments("bing")
## # A tibble: 6,786 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ... with 6,776 more rows
# Count the different sentiment types
tidytext::get_sentiments("bing") %>%
    count(sentiment) %>%
    arrange(desc(n))
## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005
# Count the different sentiment types
tidytext::get_sentiments("loughran") %>%
    count(sentiment) %>%
    arrange(desc(n))
## # A tibble: 6 x 2
##   sentiment        n
##   <chr>        <int>
## 1 negative      2355
## 2 litigious      904
## 3 positive       354
## 4 uncertainty    297
## 5 constraining   184
## 6 superfluous     56
# Count how many times each score was used
tidytext::get_sentiments("afinn") %>%
    count(value) %>%
    arrange(desc(n))
## # A tibble: 11 x 2
##    value     n
##    <dbl> <int>
##  1    -2   966
##  2     2   448
##  3    -1   309
##  4    -3   264
##  5     1   208
##  6     3   172
##  7     4    45
##  8    -4    43
##  9    -5    16
## 10     5     5
## 11     0     1
# Split animal_farm into one row per sentence, then number the sentences
# so each can be identified after further tokenization
afSents <- animal_farm %>%
    tidytext::unnest_tokens(output = "sentence", input = text_column, token = "sentences") %>%
    mutate(sentence_id=row_number())

# Print the overall sentiment associated with each pig's sentences
for(name in c("napoleon", "snowball", "squealer")) {
    # Filter to the sentences mentioning the pig
    # NOTE(review): grepl is case-sensitive; this relies on unnest_tokens
    # having lowercased the sentences -- confirm to_lower was left at default
    pig_sentences <- afSents[grepl(name, afSents$sentence), ]
    # Tokenize the text
    temp_tokens <- pig_sentences %>%
        tidytext::unnest_tokens(output = "word", token = "words", input = sentence) %>%
        anti_join(tidytext::stop_words)
    # Use afinn to find the overall sentiment score
    # (inner_join keeps only words present in the AFINN lexicon;
    #  'value' is the AFINN score, summed over all matched words)
    result <- temp_tokens %>% 
        inner_join(tidytext::get_sentiments("afinn")) %>%
        summarise(sentiment = sum(value))
    # Print the result
    print(paste0(name, ": ", result$sentiment))
}
## Joining, by = "word"
## Joining, by = "word"
## [1] "napoleon: -45"
## Joining, by = "word"
## Joining, by = "word"
## [1] "snowball: -77"
## Joining, by = "word"
## Joining, by = "word"
## [1] "squealer: -30"
# Keep only tweets from "Left" accounts, tokenize into words, and
# drop stop words (anti_join keeps rows with no match in stop_words)
left_tokens <- russian_tweets %>%
    filter(account_type=="Left") %>%
    tidytext::unnest_tokens(output = "word", token = "words", input = content) %>%
    anti_join(tidytext::stop_words)
## Joining, by = "word"
# Dictionaries 
# anticipation <- tidytext::get_sentiments("nrc") %>% 
#     filter(sentiment == "anticipation")

# joy <- tidytext::get_sentiments("nrc") %>% 
#     filter(sentiment == "joy")

# Print top words for Anticipation and Joy
# left_tokens %>%
#     inner_join(anticipation, by = "word") %>%
#     count(word, sort = TRUE)

# left_tokens %>%
#     inner_join(joy, by = "word") %>%
#     count(word, sort = TRUE)


# Initialize a h2o session
library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
h2o.init()
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\Dave\AppData\Local\Temp\RtmpIpRWr8/h2o_Dave_started_from_r.out
##     C:\Users\Dave\AppData\Local\Temp\RtmpIpRWr8/h2o_Dave_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: . Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         8 seconds 727 milliseconds 
##     H2O cluster timezone:       America/Chicago 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.28.0.2 
##     H2O cluster version age:    2 months and 17 days  
##     H2O cluster name:           H2O_started_from_R_Dave_ibx518 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   4.37 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 3.6.2 (2019-12-12)
# Create an h2o object for left_right
h2o_object = as.h2o(leftRightData)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Tokenize the words from the column of text in left_right
# NOTE(review): earlier notes use "\\W+" (two backslashes); the four
# backslashes here may be a rendering artifact -- verify the regex
tweet_words <- h2o.tokenize(h2o_object$content, "\\\\W+")

# Lowercase and remove stopwords
tweet_words <- h2o.tolower(tweet_words)
# NOTE(review): '||' is used elementwise here; h2o masks '||' for
# H2OFrames (see the attach messages above), but base R would need '|'
# for vectorized logic -- confirm the intended semantics
tweet_words = tweet_words[is.na(tweet_words) || (!tweet_words %in% tidytext::stop_words$word),]
tweet_words
##                 C1
## 1          defense
## 2            black
## 3         heritage
## 4            https
## 5       in62blh02i
## 6 blacklivesmatter
## 
## [43868 rows x 1 column]
# set.seed(1111)

# Use 33% of the available data
# sample_size <- floor(0.33 * nrow(job_titles))
# sample_data <- sample(nrow(job_titles), size = sample_size)

# h2o_object = as.h2o(job_titles[sample_data, ])
# words <- h2o.tokenize(h2o_object$jobtitle, "\\\\W+")
# words <- h2o.tolower(words)
# words = words[is.na(words) || (!words %in% stop_words$word),]

# word2vec_model <- h2o.word2vec(words, min_word_freq=5, epochs = 10)

# Find synonyms for the word "teacher"
# h2o.findSynonyms(word2vec_model, "teacher", count=10)


# a: Labels each word within text as either a noun, verb, adjective, or other category.
# b: A model pre-trained on a vast amount of text data to create a language representation used for supervised learning.
# c: A type of analysis that looks to describe text as either positive or negative and can be used to find active vs passive terms.
# d: A modeling technique used to label entire text into a single category such as relevant or not-relevant.

# Sentiment Analysis
# SA <- c

# Classification Modeling
# CM <- d

# BERT
# BERT <- b

# Part-of-speech Tagging
# POS <- a


# e: Modeling techniques, including LDA, used to cluster text into groups or types based on similar words being used.
# f: A method for searching through text and tagging words that distinguish people, locations, or organizations.
# g: Method used to search text for specific patterns.
# h: Representing words using a large vector space where similar words are close together within the vector space.

# Named Entity Recognition
# NER <- f

# Topic Modeling
# TM <- e

# Word Embeddings 
# WE <- h

# Regular Expressions
# REGEX <- g

Joining Data with dplyr

Chapter 1 - Joining Tables

The inner_join verb:

  • Joining tables together can be important for analysis
    • sets %>% inner_join(themes, by=c(“theme_id”=“id”)) # use sets.theme_id==themes.id for merging
    • sets %>% inner_join(themes, by=c(“theme_id”=“id”), suffix=c("_set“,”_theme")) # use sets.theme_id==themes.id for merging; use _set and _theme as suffixes for any common names

Joining with a one-to-many relationship:

  • Can have multiple records per id, which results in a one-to-many join (4,000 records can expand to 4,100 records after the one-to-many even with an inner_join)

Joining three or more tables:

  • Can join three or more tables using chaining
    • sets %>% inner_join(inventories, by=“set_num”) %>% inner_join(themes, by=c(“theme_id”=“id”), suffix=c("_set“,”_theme"))
  • Each join will typically have different by arguments and suffix arguments

Example code includes:

parts <- readRDS("./RInputFiles/parts.rds")
part_categories <- readRDS("./RInputFiles/part_categories.rds")
inventory_parts <- readRDS("./RInputFiles/inventory_parts.rds")
inventories <- readRDS("./RInputFiles/inventories.rds")
sets <- readRDS("./RInputFiles/sets.rds")
themes <- readRDS("./RInputFiles/themes.rds")
colors <- readRDS("./RInputFiles/colors.rds")


# Use the suffix argument to replace .x and .y suffixes
parts %>% 
    inner_join(part_categories, by = c("part_cat_id" = "id"), suffix=c("_part", "_category"))
## # A tibble: 17,501 x 4
##    part_num   name_part                             part_cat_id name_category   
##    <chr>      <chr>                                       <dbl> <chr>           
##  1 0901       Baseplate 16 x 30 with Set 080 Yello~           1 Baseplates      
##  2 0902       Baseplate 16 x 24 with Set 080 Small~           1 Baseplates      
##  3 0903       Baseplate 16 x 24 with Set 080 Red H~           1 Baseplates      
##  4 0904       Baseplate 16 x 24 with Set 080 Large~           1 Baseplates      
##  5 1          Homemaker Bookcase 2 x 4 x 4                    7 Containers      
##  6 10016414   Sticker Sheet #1 for 41055-1                   58 Stickers        
##  7 10026stk01 Sticker for Set 10026 - (44942/41841~          58 Stickers        
##  8 10039      Pullback Motor 8 x 4 x 2/3                     44 Mechanical      
##  9 10048      Minifig Hair Tousled                           65 Minifig Headwear
## 10 10049      Minifig Shield Broad with Spiked Bot~          27 Minifig Accesso~
## # ... with 17,491 more rows
# Combine the parts and inventory_parts tables
parts %>%
    inner_join(inventory_parts, by=c("part_num"))
## # A tibble: 258,958 x 6
##    part_num name                      part_cat_id inventory_id color_id quantity
##    <chr>    <chr>                           <dbl>        <dbl>    <dbl>    <dbl>
##  1 0901     Baseplate 16 x 30 with S~           1         1973        2        1
##  2 0902     Baseplate 16 x 24 with S~           1         1973        2        1
##  3 0903     Baseplate 16 x 24 with S~           1         1973        2        1
##  4 0904     Baseplate 16 x 24 with S~           1         1973        2        1
##  5 1        Homemaker Bookcase 2 x 4~           7          508       15        1
##  6 1        Homemaker Bookcase 2 x 4~           7         1158       15        2
##  7 1        Homemaker Bookcase 2 x 4~           7         6590       15        2
##  8 1        Homemaker Bookcase 2 x 4~           7         9679       15        2
##  9 1        Homemaker Bookcase 2 x 4~           7        12256        1        2
## 10 1        Homemaker Bookcase 2 x 4~           7        13356       15        1
## # ... with 258,948 more rows
# Combine the parts and inventory_parts tables
inventory_parts %>%
    inner_join(parts, by="part_num")
## # A tibble: 258,958 x 6
##    inventory_id part_num    color_id quantity name                   part_cat_id
##           <dbl> <chr>          <dbl>    <dbl> <chr>                        <dbl>
##  1           21 3009               7       50 Brick 1 x 6                     11
##  2           25 21019c00pa~       15        1 Legs and Hips with Bl~          61
##  3           25 24629pr0002       78        1 Minifig Head Special ~          59
##  4           25 24634pr0001        5        1 Headwear Accessory Bo~          27
##  5           25 24782pr0001        5        1 Minifig Hipwear Skirt~          27
##  6           25 88646              0        1 Tile Special 4 x 3 wi~          15
##  7           25 973pr3314c~        5        1 Torso with 1 White Bu~          60
##  8           26 14226c11           0        3 String with End Studs~          31
##  9           26 2340px2           15        1 Tail 4 x 1 x 3 with '~          35
## 10           26 2340px3           15        1 Tail 4 x 1 x 3 with '~          35
## # ... with 258,948 more rows
sets %>%
    # Add inventories using an inner join 
    inner_join(inventories, by="set_num") %>%
    # Add inventory_parts using an inner join 
    inner_join(inventory_parts, by=c("id"="inventory_id"))
## # A tibble: 258,958 x 9
##    set_num name           year theme_id    id version part_num color_id quantity
##    <chr>   <chr>         <dbl>    <dbl> <dbl>   <dbl> <chr>       <dbl>    <dbl>
##  1 700.3-1 Medium Gift ~  1949      365 24197       1 bdoor01         2        2
##  2 700.3-1 Medium Gift ~  1949      365 24197       1 bdoor01        15        1
##  3 700.3-1 Medium Gift ~  1949      365 24197       1 bdoor01         4        1
##  4 700.3-1 Medium Gift ~  1949      365 24197       1 bslot02        15        6
##  5 700.3-1 Medium Gift ~  1949      365 24197       1 bslot02         2        6
##  6 700.3-1 Medium Gift ~  1949      365 24197       1 bslot02         4        6
##  7 700.3-1 Medium Gift ~  1949      365 24197       1 bslot02         1        6
##  8 700.3-1 Medium Gift ~  1949      365 24197       1 bslot02        14        6
##  9 700.3-1 Medium Gift ~  1949      365 24197       1 bslot02a       15        6
## 10 700.3-1 Medium Gift ~  1949      365 24197       1 bslot02a        2        6
## # ... with 258,948 more rows
# Count the number of colors and sort
sets %>%
    inner_join(inventories, by = "set_num") %>%
    inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
    inner_join(colors, by = c("color_id" = "id"), suffix = c("_set", "_color")) %>%
    count(name_color, sort=TRUE)
## # A tibble: 134 x 2
##    name_color            n
##    <chr>             <int>
##  1 Black             48068
##  2 White             30105
##  3 Light Bluish Gray 26024
##  4 Red               21602
##  5 Dark Bluish Gray  19948
##  6 Yellow            17088
##  7 Blue              12980
##  8 Light Gray         8632
##  9 Reddish Brown      6960
## 10 Tan                6664
## # ... with 124 more rows

Chapter 2 - Left and Right Joins

The left_join verb:

  • Can join by two or more variables using by=c(…, …, …)
  • The left_join() will keep all the observations from the left dataset, bringing in matching data from the right dataset as and where it exists
    • Will introduce NA for the columns from the right dataset where there is an observation in the left dataset without any matches in the right dataset

The right_join verb:

  • The right_join() will keep all of the observations from the right (second) table and whatever matching information can be obtained from the left (first) table
  • Can replace NA values using replace_na()
    • replace_na(list(n=0)) # runs the replace only on the column ‘n’, and converts any NA to 0

Joining tables to themselves:

  • For hierarchical tables, joining the table to itself can better display the hierarchy for a given row
    • themes %>% inner_join(themes, by=c(“parent_id”=“id”), suffix=c("_child“,”_parent"))

Example code includes:

inventory_parts_joined <- inventory_parts %>%
    inner_join(inventories, by=c("inventory_id"="id")) %>%
    select(set_num, part_num, color_id, quantity)
str(inventory_parts_joined)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 258958 obs. of  4 variables:
##  $ set_num : chr  "3474-1" "71012-11" "71012-11" "71012-11" ...
##  $ part_num: chr  "3009" "21019c00pat004pr1033" "24629pr0002" "24634pr0001" ...
##  $ color_id: num  7 15 78 5 5 0 5 0 15 15 ...
##  $ quantity: num  50 1 1 1 1 1 1 3 1 1 ...
millennium_falcon <- inventory_parts_joined %>%
  filter(set_num == "7965-1")
str(millennium_falcon)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 263 obs. of  4 variables:
##  $ set_num : chr  "7965-1" "7965-1" "7965-1" "7965-1" ...
##  $ part_num: chr  "12825" "2412b" "2412b" "2419" ...
##  $ color_id: num  72 72 320 71 0 71 71 72 0 19 ...
##  $ quantity: num  3 20 2 1 4 1 7 2 1 2 ...
star_destroyer <- inventory_parts_joined %>%
  filter(set_num == "75190-1")
str(star_destroyer)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 293 obs. of  4 variables:
##  $ set_num : chr  "75190-1" "75190-1" "75190-1" "75190-1" ...
##  $ part_num: chr  "10247" "11203" "11212" "11212" ...
##  $ color_id: num  0 0 72 71 72 71 0 72 71 0 ...
##  $ quantity: num  12 6 1 6 8 1 8 1 2 1 ...
# Combine the star_destroyer and millennium_falcon tables
millennium_falcon %>%
    left_join(star_destroyer, by=c("part_num", "color_id"), suffix=c("_falcon", "_star_destroyer"))
## # A tibble: 263 x 6
##    set_num_falcon part_num color_id quantity_falcon set_num_star_de~
##    <chr>          <chr>       <dbl>           <dbl> <chr>           
##  1 7965-1         12825          72               3 <NA>            
##  2 7965-1         2412b          72              20 75190-1         
##  3 7965-1         2412b         320               2 <NA>            
##  4 7965-1         2419           71               1 <NA>            
##  5 7965-1         2420            0               4 75190-1         
##  6 7965-1         2420           71               1 <NA>            
##  7 7965-1         2420           71               7 <NA>            
##  8 7965-1         2431           72               2 <NA>            
##  9 7965-1         2431            0               1 75190-1         
## 10 7965-1         2431           19               2 <NA>            
## # ... with 253 more rows, and 1 more variable: quantity_star_destroyer <dbl>
# Aggregate Millennium Falcon for the total quantity of each color
# (groups by color_id -- not part_num, despite the original wording)
millennium_falcon_colors <- millennium_falcon %>%
    group_by(color_id) %>%
    summarize(total_quantity = sum(quantity))

# Aggregate Star Destroyer for the total quantity of each color
star_destroyer_colors <- star_destroyer %>%
    group_by(color_id) %>%
    summarize(total_quantity = sum(quantity))

# Left join the Millennium Falcon colors to the Star Destroyer colors
# (keeps every Falcon color; Destroyer-only colors are dropped, and
#  Falcon colors absent from the Destroyer get NA -- see row 8 below)
millennium_falcon_colors %>%
    left_join(star_destroyer_colors, by="color_id", suffix=c("_falcon", "_star_destroyer"))
## # A tibble: 21 x 3
##    color_id total_quantity_falcon total_quantity_star_destroyer
##       <dbl>                 <dbl>                         <dbl>
##  1        0                   201                           336
##  2        1                    15                            23
##  3        4                    17                            53
##  4       14                     3                             4
##  5       15                    15                            17
##  6       19                    95                            12
##  7       28                     3                            16
##  8       33                     5                            NA
##  9       36                     1                            14
## 10       41                     6                            15
## # ... with 11 more rows
inventory_version_1 <- inventories %>%
    filter(version == 1)

# Join versions to sets
sets %>%
    left_join(inventory_version_1, by="set_num") %>%
    # Filter for where version is na
    filter(is.na(version))
## # A tibble: 1 x 6
##   set_num name       year theme_id    id version
##   <chr>   <chr>     <dbl>    <dbl> <dbl>   <dbl>
## 1 40198-1 Ludo game  2018      598    NA      NA
parts %>%
    count(part_cat_id) %>%
    right_join(part_categories, by = c("part_cat_id" = "id")) %>%
    # Filter for NA
    filter(is.na(n))
## # A tibble: 1 x 3
##   part_cat_id     n name   
##         <dbl> <int> <chr>  
## 1          66    NA Modulex
parts %>%
    count(part_cat_id) %>%
    right_join(part_categories, by = c("part_cat_id" = "id")) %>%
    # Use replace_na to replace missing values in the n column
    replace_na(list(n=0))
## # A tibble: 64 x 3
##    part_cat_id     n name                   
##          <dbl> <dbl> <chr>                  
##  1           1   135 Baseplates             
##  2           3   303 Bricks Sloped          
##  3           4  1900 Duplo, Quatro and Primo
##  4           5   107 Bricks Special         
##  5           6   128 Bricks Wedged          
##  6           7    97 Containers             
##  7           8    24 Technic Bricks         
##  8           9   167 Plates Special         
##  9          11   490 Bricks                 
## 10          12    85 Technic Connectors     
## # ... with 54 more rows
themes %>% 
    # Inner join the themes table
    inner_join(themes, by=c("id"="parent_id"), suffix=c("_parent", "_child")) %>%
    # Filter for the "Harry Potter" parent name 
    filter(name_parent=="Harry Potter")
## # A tibble: 6 x 5
##      id name_parent  parent_id id_child name_child          
##   <dbl> <chr>            <dbl>    <dbl> <chr>               
## 1   246 Harry Potter        NA      247 Chamber of Secrets  
## 2   246 Harry Potter        NA      248 Goblet of Fire      
## 3   246 Harry Potter        NA      249 Order of the Phoenix
## 4   246 Harry Potter        NA      250 Prisoner of Azkaban 
## 5   246 Harry Potter        NA      251 Sorcerer's Stone    
## 6   246 Harry Potter        NA      667 Fantastic Beasts
# Join themes to itself again to find the grandchild relationships
# NOTE: a join's suffix only disambiguates columns that collide in THAT
# join -- the grandchild's 'name' column stays unsuffixed in the output
# below because 'name_parent'/'name_child' were already renamed
themes %>% 
    inner_join(themes, by = c("id" = "parent_id"), suffix = c("_parent", "_child")) %>%
    inner_join(themes, by = c("id_child" = "parent_id"), suffix = c("_parent", "_grandchild"))
## # A tibble: 158 x 7
##    id_parent name_parent parent_id id_child name_child id_grandchild name       
##        <dbl> <chr>           <dbl>    <dbl> <chr>              <dbl> <chr>      
##  1         1 Technic            NA        5 Model                  6 Airport    
##  2         1 Technic            NA        5 Model                  7 Constructi~
##  3         1 Technic            NA        5 Model                  8 Farm       
##  4         1 Technic            NA        5 Model                  9 Fire       
##  5         1 Technic            NA        5 Model                 10 Harbor     
##  6         1 Technic            NA        5 Model                 11 Off-Road   
##  7         1 Technic            NA        5 Model                 12 Race       
##  8         1 Technic            NA        5 Model                 13 Riding Cyc~
##  9         1 Technic            NA        5 Model                 14 Robot      
## 10         1 Technic            NA        5 Model                 15 Traffic    
## # ... with 148 more rows
themes %>% 
    # Left join the themes table to its own children
    left_join(themes, by=c("id"="parent_id"), suffix=c("_parent", "_child")) %>%
    # Filter for themes that have no child themes
    filter(is.na(id_child))
## # A tibble: 586 x 5
##       id name_parent    parent_id id_child name_child
##    <dbl> <chr>              <dbl>    <dbl> <chr>     
##  1     2 Arctic Technic         1       NA <NA>      
##  2     3 Competition            1       NA <NA>      
##  3     4 Expert Builder         1       NA <NA>      
##  4     6 Airport                5       NA <NA>      
##  5     7 Construction           5       NA <NA>      
##  6     8 Farm                   5       NA <NA>      
##  7     9 Fire                   5       NA <NA>      
##  8    10 Harbor                 5       NA <NA>      
##  9    11 Off-Road               5       NA <NA>      
## 10    12 Race                   5       NA <NA>      
## # ... with 576 more rows

Chapter 3 - Full, Semi, and Anti Joins

The full_join verb:

  • The full_join() will keep all records from both tables, matching them where possible and leaving NA for columns of non-matching records
  • Can use tidyr::replace_na() on multiple variables
    • replace_na(list(a=0, b=0))

The semi and anti-join verbs:

  • The full_join, left_join, right_join, and inner_join are all mutating verbs, which is to say that they change the number of columns
  • Filtering joins are different in that they keep only a subset of records, without mutating any of the columns
    • semi_join() keeps observations in x that can be found in y
    • anti_join() keeps observations in x that cannot be found in y

Visualizing set differences:

  • Can aggregate all of the sets to colors, summing the quantities by color for each set
  • Can then full join the color schemes together, for a single table of all the colors used

Example code includes:

inventory_parts_joined <- inventories %>%
    inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
    arrange(desc(quantity)) %>%
    select(-id, -version)
str(inventory_parts_joined)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 258958 obs. of  4 variables:
##  $ set_num : chr  "40179-1" "40179-1" "40179-1" "40179-1" ...
##  $ part_num: chr  "3024" "3024" "3024" "3024" ...
##  $ color_id: num  72 15 0 71 14 15 320 0 0 0 ...
##  $ quantity: num  900 900 900 900 900 810 771 720 684 540 ...
inventory_parts_joined %>%
    # Combine the sets table with inventory_parts_joined 
    inner_join(sets, by=c("set_num"="set_num")) %>%
    # Combine the themes table with your first join 
    inner_join(themes, by=c("theme_id"="id"), suffix=c("_set", "_theme"))
## # A tibble: 258,958 x 9
##    set_num part_num color_id quantity name_set  year theme_id name_theme
##    <chr>   <chr>       <dbl>    <dbl> <chr>    <dbl>    <dbl> <chr>     
##  1 40179-1 3024           72      900 Persona~  2016      277 Mosaic    
##  2 40179-1 3024           15      900 Persona~  2016      277 Mosaic    
##  3 40179-1 3024            0      900 Persona~  2016      277 Mosaic    
##  4 40179-1 3024           71      900 Persona~  2016      277 Mosaic    
##  5 40179-1 3024           14      900 Persona~  2016      277 Mosaic    
##  6 k34434~ 3024           15      810 Lego Mo~  2003      277 Mosaic    
##  7 21010-1 3023          320      771 Robie H~  2011      252 Architect~
##  8 k34431~ 3024            0      720 Lego Mo~  2003      277 Mosaic    
##  9 42083-1 2780            0      684 Bugatti~  2018        5 Model     
## 10 k34434~ 3024            0      540 Lego Mo~  2003      277 Mosaic    
## # ... with 258,948 more rows, and 1 more variable: parent_id <dbl>
inventory_sets_themes <- inventory_parts_joined %>%
    inner_join(sets, by = "set_num") %>%
    inner_join(themes, by = c("theme_id" = "id"), suffix = c("_set", "_theme"))
str(inventory_sets_themes)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 258958 obs. of  9 variables:
##  $ set_num   : chr  "40179-1" "40179-1" "40179-1" "40179-1" ...
##  $ part_num  : chr  "3024" "3024" "3024" "3024" ...
##  $ color_id  : num  72 15 0 71 14 15 320 0 0 0 ...
##  $ quantity  : num  900 900 900 900 900 810 771 720 684 540 ...
##  $ name_set  : chr  "Personalised Mosaic Portrait" "Personalised Mosaic Portrait" "Personalised Mosaic Portrait" "Personalised Mosaic Portrait" ...
##  $ year      : num  2016 2016 2016 2016 2016 ...
##  $ theme_id  : num  277 277 277 277 277 277 252 277 5 277 ...
##  $ name_theme: chr  "Mosaic" "Mosaic" "Mosaic" "Mosaic" ...
##  $ parent_id : num  276 276 276 276 276 276 NA 276 1 276 ...
batman <- inventory_sets_themes %>%
    filter(name_theme == "Batman")
str(batman)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 3783 obs. of  9 variables:
##  $ set_num   : chr  "7787-1" "70904-1" "70904-1" "77903-1" ...
##  $ part_num  : chr  "3873" "6141" "4032a" "3023" ...
##  $ color_id  : num  0 84 84 46 0 84 179 0 72 34 ...
##  $ quantity  : num  158 81 67 46 44 41 31 28 28 26 ...
##  $ name_set  : chr  "The Bat-Tank: The Riddler and Bane's Hideout" "Clayface Splat Attack" "Clayface Splat Attack" "The Dark Knight of Gotham City" ...
##  $ year      : num  2007 2017 2017 2019 2007 ...
##  $ theme_id  : num  484 484 484 484 484 484 484 484 484 484 ...
##  $ name_theme: chr  "Batman" "Batman" "Batman" "Batman" ...
##  $ parent_id : num  482 482 482 482 482 482 482 482 482 482 ...
star_wars <- inventory_sets_themes %>%
    filter(name_theme == "Star Wars")
str(star_wars)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 5402 obs. of  9 variables:
##  $ set_num   : chr  "7194-1" "7194-1" "7194-1" "75244-1" ...
##  $ part_num  : chr  "2357" "3001" "2420" "2780" ...
##  $ color_id  : num  19 19 378 0 19 19 19 0 0 15 ...
##  $ quantity  : num  84 73 72 64 57 55 54 54 52 49 ...
##  $ name_set  : chr  "Yoda" "Yoda" "Yoda" "Tantive IV" ...
##  $ year      : num  2002 2002 2002 2019 2002 ...
##  $ theme_id  : num  158 158 158 158 158 158 158 158 158 158 ...
##  $ name_theme: chr  "Star Wars" "Star Wars" "Star Wars" "Star Wars" ...
##  $ parent_id : num  NA NA NA NA NA NA NA NA NA NA ...
# Count the part number and color id, weight by quantity
(batman_parts <- batman %>%
    count(part_num, color_id, wt=quantity))
## # A tibble: 2,071 x 3
##    part_num color_id     n
##    <chr>       <dbl> <dbl>
##  1 10113           0    11
##  2 10113         272     1
##  3 10113         320     1
##  4 10183          57     1
##  5 10190           0     2
##  6 10201           0     1
##  7 10201           4     3
##  8 10201          14     1
##  9 10201          15     6
## 10 10201          71     4
## # ... with 2,061 more rows
(star_wars_parts <- star_wars %>%
    count(part_num, color_id, wt=quantity))
## # A tibble: 2,413 x 3
##    part_num color_id     n
##    <chr>       <dbl> <dbl>
##  1 10169           4     1
##  2 10197           0     2
##  3 10197          72     3
##  4 10201           0    21
##  5 10201          71     5
##  6 10247           0     9
##  7 10247          71    16
##  8 10247          72    12
##  9 10884          28     1
## 10 10928          72     6
## # ... with 2,403 more rows
# Wrapping the assignment in parentheses both assigns and prints the result
(parts_joined <- batman_parts %>%
    # Combine the star_wars_parts table 
    full_join(star_wars_parts, by=c("part_num", "color_id"), suffix=c("_batman", "_star_wars")) %>%
    # Replace NAs with 0s in the n_batman and n_star_wars columns 
    # (a full join leaves NA for parts present in only one set)
    replace_na(list(n_batman=0, n_star_wars=0)))
## # A tibble: 3,628 x 4
##    part_num color_id n_batman n_star_wars
##    <chr>       <dbl>    <dbl>       <dbl>
##  1 10113           0       11           0
##  2 10113         272        1           0
##  3 10113         320        1           0
##  4 10183          57        1           0
##  5 10190           0        2           0
##  6 10201           0        1          21
##  7 10201           4        3           0
##  8 10201          14        1           0
##  9 10201          15        6           0
## 10 10201          71        4           5
## # ... with 3,618 more rows
# Enrich the joined part counts with human-readable color and part names
parts_joined %>%
    # Sort the number of star wars pieces in descending order 
    arrange(-n_star_wars) %>%
    # Join the colors table to the parts_joined table
    left_join(colors, by=c("color_id"="id")) %>%
    # Join the parts table to the previous join 
    left_join(parts, by=c("part_num"), suffix=c("_color", "_part"))
## # A tibble: 3,628 x 8
##    part_num color_id n_batman n_star_wars name_color rgb   name_part part_cat_id
##    <chr>       <dbl>    <dbl>       <dbl> <chr>      <chr> <chr>           <dbl>
##  1 2780            0      104         392 Black      #051~ Technic ~          53
##  2 32062           0        1         141 Black      #051~ Technic ~          46
##  3 4274            1       56         118 Blue       #005~ Technic ~          53
##  4 6141           36       11         117 Trans-Red  #C91~ Plate Ro~          21
##  5 3023           71       10         106 Light Blu~ #A0A~ Plate 1 ~          14
##  6 6558            1       30         106 Blue       #005~ Technic ~          53
##  7 43093           1       44          99 Blue       #005~ Technic ~          53
##  8 3022           72       14          95 Dark Blui~ #6C6~ Plate 2 ~          14
##  9 2357           19        0          84 Tan        #E4C~ Brick 2 ~          11
## 10 6141          179       90          81 Flat Silv~ #898~ Plate Ro~          21
## # ... with 3,618 more rows
# Subset the inventory to the Batmobile set (set_num 7784-1); set_num is
# now constant within the subset, so it is dropped
batmobile <- inventory_parts_joined %>%
    filter(set_num == "7784-1") %>%
    select(-set_num)
str(batmobile)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 173 obs. of  3 variables:
##  $ part_num: chr  "3023" "2780" "50950" "3004" ...
##  $ color_id: num  72 0 0 71 1 0 0 0 14 0 ...
##  $ quantity: num  62 28 28 26 25 23 21 21 19 18 ...
# Subset the inventory to the Batwing set (set_num 70916-1)
batwing <- inventory_parts_joined %>%
    filter(set_num == "70916-1") %>%
    select(-set_num)
str(batwing)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 309 obs. of  3 variables:
##  $ part_num: chr  "3023" "3024" "3623" "11477" ...
##  $ color_id: num  0 0 0 0 71 0 0 0 0 0 ...
##  $ quantity: num  22 22 20 18 18 17 16 14 14 13 ...
# Filter the batwing set for parts that are also in the batmobile set
# (semi_join keeps matching batwing rows without adding batmobile's columns)
batwing %>%
    semi_join(batmobile, by=c("part_num"))
## # A tibble: 126 x 3
##    part_num color_id quantity
##    <chr>       <dbl>    <dbl>
##  1 3023            0       22
##  2 3024            0       22
##  3 3623            0       20
##  4 2780            0       17
##  5 3666            0       16
##  6 3710            0       14
##  7 6141            4       12
##  8 2412b          71       10
##  9 6141           72       10
## 10 6558            1        9
## # ... with 116 more rows
# Filter the batwing set for parts that aren't in the batmobile set
# (anti_join keeps rows whose part_num has no match in batmobile)
batwing %>%
    anti_join(batmobile, by=c("part_num"))
## # A tibble: 183 x 3
##    part_num color_id quantity
##    <chr>       <dbl>    <dbl>
##  1 11477           0       18
##  2 99207          71       18
##  3 22385           0       14
##  4 99563           0       13
##  5 10247          72       12
##  6 2877           72       12
##  7 61409          72       12
##  8 11153           0       10
##  9 98138          46       10
## 10 2419           72        9
## # ... with 173 more rows
# Use inventory_parts to find colors included in at least one set
# (keeps only colors whose id appears as a color_id in inventory_parts)
colors %>%
    semi_join(inventory_parts, by=c("id"="color_id"))
## # A tibble: 134 x 3
##       id name           rgb    
##    <dbl> <chr>          <chr>  
##  1    -1 [Unknown]      #0033B2
##  2     0 Black          #05131D
##  3     1 Blue           #0055BF
##  4     2 Green          #237841
##  5     3 Dark Turquoise #008F9B
##  6     4 Red            #C91A09
##  7     5 Dark Pink      #C870A0
##  8     6 Brown          #583927
##  9     7 Light Gray     #9BA19D
## 10     8 Dark Gray      #6D6E5C
## # ... with 124 more rows
# Use filter() to extract version 1 
version_1_inventories <- inventories %>%
    filter(version==1)

# Use anti_join() to find which set is missing a version 1
# (sets whose set_num never appears among the version-1 inventories)
sets %>%
    anti_join(version_1_inventories, by=c("set_num"))
## # A tibble: 1 x 4
##   set_num name       year theme_id
##   <chr>   <chr>     <dbl>    <dbl>
## 1 40198-1 Ludo game  2018      598
# Build one table linking every inventory part to its set and theme names;
# the suffixes distinguish the set name from the theme name after the joins
(inventory_parts_themes <- inventories %>%
    inner_join(inventory_parts, by = c("id" = "inventory_id")) %>%
    arrange(desc(quantity)) %>%
    select(-id, -version) %>%
    inner_join(sets, by = "set_num") %>%
    inner_join(themes, by = c("theme_id" = "id"), suffix = c("_set", "_theme")))
## # A tibble: 258,958 x 9
##    set_num part_num color_id quantity name_set  year theme_id name_theme
##    <chr>   <chr>       <dbl>    <dbl> <chr>    <dbl>    <dbl> <chr>     
##  1 40179-1 3024           72      900 Persona~  2016      277 Mosaic    
##  2 40179-1 3024           15      900 Persona~  2016      277 Mosaic    
##  3 40179-1 3024            0      900 Persona~  2016      277 Mosaic    
##  4 40179-1 3024           71      900 Persona~  2016      277 Mosaic    
##  5 40179-1 3024           14      900 Persona~  2016      277 Mosaic    
##  6 k34434~ 3024           15      810 Lego Mo~  2003      277 Mosaic    
##  7 21010-1 3023          320      771 Robie H~  2011      252 Architect~
##  8 k34431~ 3024            0      720 Lego Mo~  2003      277 Mosaic    
##  9 42083-1 2780            0      684 Bugatti~  2018        5 Model     
## 10 k34434~ 3024            0      540 Lego Mo~  2003      277 Mosaic    
## # ... with 258,948 more rows, and 1 more variable: parent_id <dbl>
# Aggregate Batman-theme quantities by color and convert to shares
batman_colors <- inventory_parts_themes %>%
    # Filter the inventory_parts_themes table for the Batman theme
    filter(name_theme=="Batman") %>%
    group_by(color_id) %>%
    summarize(total = sum(quantity)) %>%
    # Add a percent column of the total divided by the sum of the total 
    mutate(percent=total/sum(total))

# Filter and aggregate the Star Wars set data; add a percent column
star_wars_colors <- inventory_parts_themes %>%
    filter(name_theme=="Star Wars") %>%
    group_by(color_id) %>%
    summarize(total = sum(quantity)) %>%
    mutate(percent=total/sum(total))


# Join the two themes' color shares; colors absent from one theme get 0s
(colors_joined <- batman_colors %>%
    full_join(star_wars_colors, by = "color_id", suffix = c("_batman", "_star_wars")) %>%
    replace_na(list(total_batman = 0, total_star_wars = 0, percent_batman=0, percent_star_wars=0)) %>%
    inner_join(colors, by = c("color_id" = "id")) %>%
    # Create the difference and total columns
    mutate(difference = percent_batman - percent_star_wars, total = total_batman + total_star_wars) %>%
    # Keep combined totals of at least 200 (note: the filter uses >=, not >)
    filter(total >= 200))
## # A tibble: 16 x 9
##    color_id total_batman percent_batman total_star_wars percent_star_wa~ name 
##       <dbl>        <dbl>          <dbl>           <dbl>            <dbl> <chr>
##  1        0         2807        0.296              3258          0.207   Black
##  2        1          243        0.0256              410          0.0261  Blue 
##  3        4          529        0.0558              434          0.0276  Red  
##  4       14          426        0.0449              207          0.0132  Yell~
##  5       15          404        0.0426             1771          0.113   White
##  6       19          142        0.0150             1012          0.0644  Tan  
##  7       28           98        0.0103              183          0.0116  Dark~
##  8       36           86        0.00907             246          0.0156  Tran~
##  9       46          200        0.0211               39          0.00248 Tran~
## 10       70          297        0.0313              373          0.0237  Redd~
## 11       71         1148        0.121              3264          0.208   Ligh~
## 12       72         1453        0.153              2433          0.155   Dark~
## 13       84          278        0.0293               31          0.00197 Medi~
## 14      179          154        0.0162              232          0.0148  Flat~
## 15      378           22        0.00232             430          0.0273  Sand~
## 16        7            0        0                   209          0.0133  Ligh~
## # ... with 3 more variables: rgb <chr>, difference <dbl>, total <dbl>
# Named hex palette: names match the LEGO color names in colors_joined so
# scale_fill_manual() can map each bar to its actual brick color
# NOTE(review): Trans-Red reuses Red's hex (#C91A09) — confirm intended
color_palette <- c('#05131D', '#0055BF', '#C91A09', '#F2CD37', '#FFFFFF', '#E4CD9E', '#958A73', '#C91A09', '#F5CD2F', '#582A12', '#A0A5A9', '#6C6E68', '#CC702A', '#898788', '#A0BCAC', '#D3D3D3')
names(color_palette) <- c('Black', 'Blue', 'Red', 'Yellow', 'White', 'Tan', 'Dark Tan', 'Trans-Red', 'Trans-Yellow', 'Reddish Brown', 'Light Bluish Gray', 'Dark Bluish Gray', 'Medium Dark Flesh', 'Flat Silver', 'Sand Green', 'Light Gray')
color_palette
##             Black              Blue               Red            Yellow 
##         "#05131D"         "#0055BF"         "#C91A09"         "#F2CD37" 
##             White               Tan          Dark Tan         Trans-Red 
##         "#FFFFFF"         "#E4CD9E"         "#958A73"         "#C91A09" 
##      Trans-Yellow     Reddish Brown Light Bluish Gray  Dark Bluish Gray 
##         "#F5CD2F"         "#582A12"         "#A0A5A9"         "#6C6E68" 
## Medium Dark Flesh       Flat Silver        Sand Green        Light Gray 
##         "#CC702A"         "#898788"         "#A0BCAC"         "#D3D3D3"
# Create a bar plot using colors_joined and the name and difference columns
# reorder() sorts the bars by difference; guide = FALSE hides the fill legend
# (valid in the ggplot2 3.2.x used here; newer versions prefer guide = "none")
ggplot(colors_joined, aes(x=reorder(name, difference), y=difference, fill = name)) +
    geom_col() +
    coord_flip() +
    scale_fill_manual(values = color_palette, guide = FALSE) +
    labs(y = "Difference: Batman - Star Wars")


Chapter 4 - Case Study: Stack Overflow

Stack Overflow Questions:

  • The ‘questions’ table contains id-creation_date-score (unique by id)
  • The ‘questions_tag’ table contains question_id-tag_id (many-to-many)
  • The ‘tags’ table contains id-tag_name (unique by id?)

Joining Questions and Answers:

  • The ‘answers’ table contains id-creation_date-question_id-score (one-to-many)
    • Each question can have zero or more answers, linked back to its question via the question_id column

The bind_rows verb:

  • Can use bind_rows to stack data on top of each other into a single frame
    • questions %>% bind_rows(answers)
    • Can mutate a type into each of the raw tables prior to the bind_rows so that the source is known for future reference or analysis
  • Can use lubridate::year(myDate) to get the year from a date object

Wrap up:

  • Joining verbs
    • Mutating joins - inner, left, right, full
    • Filtering joins - semi, anti
  • Stacking using bind_rows

Example code includes:

# Load the Stack Overflow tables used throughout this chapter
questions <- readRDS("./RInputFiles/questions.rds")
tags <- readRDS("./RInputFiles/tags.rds")
question_tags <- readRDS("./RInputFiles/question_tags.rds")
answers <- readRDS("./RInputFiles/answers.rds")


# Replace the NAs in the tag_name column
# (left joins keep untagged questions; their missing tag becomes "only-r")
questions_with_tags <- questions %>%
    left_join(question_tags, by = c("id" = "question_id")) %>%
    left_join(tags, by = c("tag_id" = "id")) %>%
    replace_na(list(tag_name="only-r"))


# Mean score and question count per tag, most common tags first
questions_with_tags %>%
    # Group by tag_name
    group_by(tag_name) %>%
    # Get mean score and num_questions
    summarize(score = mean(score), num_questions = n()) %>%
    # Sort num_questions in descending order
    arrange(-num_questions)
## # A tibble: 7,841 x 3
##    tag_name   score num_questions
##    <chr>      <dbl>         <int>
##  1 only-r     1.26          48541
##  2 ggplot2    2.61          28228
##  3 dataframe  2.31          18874
##  4 shiny      1.45          14219
##  5 dplyr      1.95          14039
##  6 plot       2.24          11315
##  7 data.table 2.97           8809
##  8 matrix     1.66           6205
##  9 loops      0.743          5149
## 10 regex      2              4912
## # ... with 7,831 more rows
# Using a join, filter for tags that are never on an R question
# (tags whose id never appears as a tag_id in question_tags)
tags %>%
    anti_join(question_tags, by=c("id"="tag_id"))
## # A tibble: 40,459 x 2
##        id tag_name                 
##     <dbl> <chr>                    
##  1 124399 laravel-dusk             
##  2 124402 spring-cloud-vault-config
##  3 124404 spring-vault             
##  4 124405 apache-bahir             
##  5 124407 astc                     
##  6 124408 simulacrum               
##  7 124410 angulartics2             
##  8 124411 django-rest-viewsets     
##  9 124414 react-native-lightbox    
## 10 124417 java-module              
## # ... with 40,449 more rows
# Pair each question with its answers and compute the days until each answer
questions %>%
    # Inner join questions and answers with proper suffixes
    inner_join(answers, by=c("id"="question_id"), suffix=c("_question", "_answer")) %>%
    # Subtract creation_date_question from creation_date_answer to create gap
    mutate(gap = as.integer(creation_date_answer-creation_date_question))
## # A tibble: 380,643 x 7
##        id creation_date_q~ score_question id_answer creation_date_a~
##     <int> <date>                    <int>     <int> <date>          
##  1 2.26e7 2014-03-21                    1  22560670 2014-03-21      
##  2 2.26e7 2014-03-21                    2  22558516 2014-03-21      
##  3 2.26e7 2014-03-21                    2  22558726 2014-03-21      
##  4 2.26e7 2014-03-21                    2  22558085 2014-03-21      
##  5 2.26e7 2014-03-21                    2  22606545 2014-03-24      
##  6 2.26e7 2014-03-21                    2  22610396 2014-03-24      
##  7 2.26e7 2014-03-21                    2  34374729 2015-12-19      
##  8 2.26e7 2014-03-21                    2  22559327 2014-03-21      
##  9 2.26e7 2014-03-21                    2  22560102 2014-03-21      
## 10 2.26e7 2014-03-21                    2  22560288 2014-03-21      
## # ... with 380,633 more rows, and 2 more variables: score_answer <int>,
## #   gap <int>
# Count and sort the question id column in the answers table
answer_counts <- answers %>%
    count(question_id, sort=TRUE)

# Combine the answer_counts and questions tables
# (left join keeps unanswered questions; their NA count becomes 0 below)
question_answer_counts <- questions %>%
    left_join(answer_counts, by=c("id"="question_id")) %>%
    # Replace the NAs in the n column
    replace_na(list(n=0))


# Attach tag names to each question's answer count
tagged_answers <- question_answer_counts %>%
    # Join the question_tags tables
    inner_join(question_tags, by=c("id"="question_id")) %>%
    # Join the tags table
    inner_join(tags, by=c("tag_id"="id"))


# Questions per tag and the average number of answers each received
tagged_answers %>%
    # Aggregate by tag_name
    group_by(tag_name) %>%
    # Summarize questions and average_answers
    summarize(questions = n(), average_answers = mean(n)) %>%
    # Sort the questions in descending order
    arrange(-questions)
## # A tibble: 7,840 x 3
##    tag_name   questions average_answers
##    <chr>          <int>           <dbl>
##  1 ggplot2        28228           1.15 
##  2 dataframe      18874           1.67 
##  3 shiny          14219           0.921
##  4 dplyr          14039           1.55 
##  5 plot           11315           1.23 
##  6 data.table      8809           1.47 
##  7 matrix          6205           1.45 
##  8 loops           5149           1.39 
##  9 regex           4912           1.91 
## 10 function        4892           1.30 
## # ... with 7,830 more rows
# Inner join the question_tags and tags tables with the questions table
questions_with_tags <- questions %>%
    inner_join(question_tags, by = c("id"="question_id")) %>%
    inner_join(tags, by = c("tag_id"="id"))

# Inner join the question_tags and tags tables with the answers table
answers_with_tags <- answers %>%
    inner_join(question_tags, by = c("question_id"="question_id")) %>%
    inner_join(tags, by = c("tag_id"="id"))


# Combine the two tables into posts_with_tags
# (a type column is added first so each row's source table remains known)
posts_with_tags <- bind_rows(questions_with_tags %>% mutate(type = "question"), answers_with_tags %>% mutate(type = "answer"))

# Add a year column, then aggregate by type, year, and tag_name
by_type_year_tag <- posts_with_tags %>%
    mutate(year=lubridate::year(creation_date)) %>%
    count(type, year, tag_name)


# Filter for the dplyr and ggplot2 tag names 
by_type_year_tag_filtered <- by_type_year_tag %>%
    filter(tag_name %in% c("dplyr", "ggplot2"))

# Create a line plot faceted by the tag name 
# (one panel per tag; separate lines for questions vs. answers)
ggplot(by_type_year_tag_filtered, aes(x=year, y=n, color = type)) +
    geom_line() +
    facet_wrap(~ tag_name)


Introduction to TensorFlow in R

Chapter 1 - Introducing TensorFlow in R

What is TensorFlow?

  • TensorFlow was created by Google Brain - open source library with Python/R as a front-end API
    • C++ application execution, though this is largely hidden from the user
    • Particularly popular for image classification, NLP, RNN, etc.
  • Need to install TensorFlow on a computer before installing the “tensorflow” library
    • library(tensorflow)
    • firstsession = tf$Session()
    • print(firstsession$run())
    • firstsession$close()

TensorFlow Syntax, Variables, and Placeholders:

  • Constants create nodes that have non-changing values throughout the session
    • tf$constant() # value, dtype (will default to float for all numbers if not specified), shape=None
  • Variables may change over the course of the session
    • tf$Variable(‘initial value’, ‘optional name’)
    • EmptyMatrix <- tf$Variable(tf$zeros(shape(4, 3)))
  • Placeholders can be created for use later
    • tf$placeholder(dtype, shape=None, name=None)
    • SinglePlaceholder <- tf$placeholder(tf$float32)

TensorBoard - Visualizing TensorFlow Models:

  • TensorBoard allows for visualizing the TensorFlow models
    • Browser-based and will open locally
    • session = tf$Session()
    • a <- tf$constant(5, name=“NumAdults”)
    • b <- tf$constant(6, name=“NumChildren”)
    • d <- tf$add(a, b)
    • session$run(d)
    • writemygraph <- tf$summary$FileWriter(‘./graphs’, session$graph)
    • tensorboard(log_dir = ‘./graphs’)

Example code includes:

# Miniconda has been successfully installed at "C:/.../AppData/Local/r-miniconda".
# Need to install and PATH tensorflow for this to work
# NOTE(review): these examples use the TensorFlow 1.x Session API
# (tf$Session, tf$placeholder); they will not run unmodified on TF 2.x — confirm version

library(tensorflow)


# Create your session
sess <- tf$Session()

# Define a constant (you'll learn this next!)
HiThere <- tf$constant('Hi DataCamp Student!')

# Run your session with the HiThere constant
print(sess$run(HiThere))

# Close the session
sess$close()


# Create two constant tensors
myfirstconstanttensor <- tf$constant(152)
mysecondconstanttensor <- tf$constant('I am a tensor master!')

# Create a matrix of zeros
myfirstvariabletensor <- tf$Variable(tf$zeros(shape(5, 1)))


# Set up your session
EmployeeSession <- tf$Session()

# Add your constants (the name arguments label the nodes in the TensorBoard graph)
female <- tf$constant(150, name = "FemaleEmployees")
male <- tf$constant(135, name = "MaleEmployees")
total <- tf$add(female, male)
print(EmployeeSession$run(total))

# Write to file
towrite <- tf$summary$FileWriter('./graphs', EmployeeSession$graph)

# Open Tensorboard
tensorboard(log_dir = './graphs')


# From last exercise
total <- tf$add(female,male)

# Multiply your allemps by growth projections
growth <- tf$constant(1.32, name = "EmpGrowth")
EmpGrowth <- tf$math$multiply(total, growth)
print(EmployeeSession$run(EmpGrowth))

# Write to file
towrite <- tf$summary$FileWriter('./graphs', EmployeeSession$graph)

# Open Tensorboard
tensorboard(log_dir = './graphs')


# Start Session
sess <- tf$Session()

# Create 2 constants
a <- tf$constant(10)
b <- tf$constant(32)

# Add your two constants together
sess$run(a + b)

# Create a Variable
mytestvariable <- tf$Variable(tf$zeros(shape(1L)))

# Run the last line
mytestvariable

Chapter 2 - Linear Regression Using Two TensorFlow API

Core API: Linear Regression:

  • The Core API is a low-level API that allows for full control
  • The Keras API is a higher-level interface that allows for running neural networks
  • The Estimators API is the highest-level interface that has canned models available for running
  • Can use the Core API for running linear regression models for y ~ x
    • x_actual <- beer_train$precip
    • y_actual <- beer_train$beer_consumed
    • w <- tf$Variable(tf$random_uniform(shape(1L), -1, 1)) # min is -1, max is 1
    • b <- tf$Variable(tf$zeros(shape(1L)))
    • y_predict <- w * x_data + b

Core API: Linear Regression Part II:

  • Cost functions are a measure of the loss (error) in the model - frequently by comparing predictions to actuals
    • loss <- tf$reduce_mean((y_predict - y_actual)**2)
  • There are many optimizers available, with Gradient Descent being a common choice
    • optimizer <- tf$train$GradientDescentOptimizer(0.001)
    • train <- optimizer$minimize(loss)
    • sess <- tf$Session()
    • sess$run(tf$global_variables_initializer())

Core API: Linear Regression Part III:

  • Can run model for a specified number of epochs
    • for (step in 1:2000) {
    • sess$run(train)  
    • if (step %% 500 == 0) cat("Step = ", step, "Estimate w = ", sess$run(w), "Estimate b = ", sess$run(b))  
    • }

Estimators API: Multiple Linear Regression:

  • Begin by defining feature columns for the Estimators API
    • ftr_colns <- feature_columns(tf$feature_column$numeric_column(“numericcolumnname”), tf$feature_column$categorical_column_with_identity(“categoricalcolumnname”, NumCategories))
  • Can use any of 6 canned models in the Estimators API, including linear regressor
    • modelName <- linear_regressor(feature_columns = ftr_colns)
    • functionName <- function(data) { input_fn(data, features=c(“feature1”, “feature2”, …), response = “responsevariable”) }
    • train(modelName, functionName(trainingData))
    • modeleval <- evaluate(modelName, functionName(trainingData))
    • modeleval

Example code includes:

# Parse out the minimum study time and final percent in x_data and y_data variables
x_data <- studentgradeprediction_train$minstudytime
y_data <- studentgradeprediction_train$Finalpercent


# Define your w variable (slope; initialized uniformly in [-1, 1])
w <- tf$Variable(tf$random_uniform(shape(1L), -1.0, 1.0))

# Define your b variable (intercept; initialized to zero)
b <- tf$Variable(tf$zeros(shape(1L)))

# Define your linear equation
y <- w * x_data + b


# Define cost function (mean squared error of predictions vs. actuals)
loss <- tf$reduce_mean((y-y_data)^2)

# Use the Gradient Descent Optimizer (learning rate 0.0001)
optimizer <- tf$train$GradientDescentOptimizer(0.0001)

# Minimize MSE loss
train <- optimizer$minimize(loss)


# Launch new session
Finalgradessession <- tf$Session()

# Initialize (run) global variables
Finalgradessession$run(tf$global_variables_initializer())


# Train your model (report the current w and b estimates every 750 steps)
for (step in 1:3750) {
    Finalgradessession$run(train)
    if (step %% 750 == 0) cat("Step = ", step, "Estimate w = ", Finalgradessession$run(w), "Estimate b =", Finalgradessession$run(b), "\n")
}


# Calculate the predicted grades on the held-out test set using fitted w and b
grades_actual <- studentgradeprediction_test$Finalpercent
grades_predicted <- as.vector(Finalgradessession$run(w)) * 
                    studentgradeprediction_test$minstudytime +
                    as.vector(Finalgradessession$run(b))

# Plot the actual and predicted grades
plot(grades_actual, grades_predicted, pch=19, col='red')

# Run a correlation 
cor(grades_actual, grades_predicted)


# Define all four of your feature columns
# NOTE(review): this exercise skeleton is incomplete — the feature column
# definitions were left blank in the original notes
ftr_colns <- feature_columns(

  
  
  
)


# Choose the correct model
grademodel <- linear_regressor(feature_columns = ftr_colns)

# Define your input function
# NOTE(review): the body was left blank; an input_fn() call belongs here
grade_input_fn <- function(data){
  
}


# Train your model
train(grademodel, grade_input_fn(train))

# Evaluate your model
model_eval <- evaluate(grademodel, grade_input_fn(test))

# See the results
model_eval


# Calculate the predictions
# NOTE(review): grademodel_input_fn is not defined above — presumably this
# should be grade_input_fn; confirm against the original exercise
predictoutput <- predict(grademodel, input_fn=grademodel_input_fn(studentgradeprediction_test))

# Plot actual and predicted values
plot(studentgradeprediction_test$Finalpercent, as.numeric(predictoutput$predictions), 
     xlab = "actual_grades", ylab = "predicted_grades", pch=19, col='red'
     )

# Calculate the correlation
cor(as.numeric(predictoutput$predictions), studentgradeprediction_test$Finalpercent)

Chapter 3 - Deep Learning in TensorFlow: Creating a Deep Neural Network

Gentle Introduction to Neural Networks:

  • Neural networks include input layers, output layers, and hidden layers
    • Series of multiplications, additions, and activation functions determine the value at each node
  • ReLU (rectified linear unit) is one of the more common activation functions
    • Converts any negative value to 0 and returns any non-negative value as-is

Deep Neural Networks Using Keras API:

  • Example of using Keras API to predict whether a specific bill is genuine or counterfeit
    • Define - Compile - Fit - Evaluate - Predict
    • model = keras_model_sequential()
    • model %>% layer_dense(units=15, activation=‘relu’, input_shape=ncol(train_x)) %>% layer_dense(units=5, activation=‘relu’) %>% layer_dense(units=1)
    • model %>% compile(optimizer=‘rmsprop’, loss=‘mse’, metrics=c(‘accuracy’, ‘mae’))
    • model %>% fit(x=x_train, y=y_train, epochs=25, batch_size=32, validation_split=0.2)

Evaluate, Predict, Visualize Model:

  • Can evaluate the model using the evaluate() command
    • score = model %>% evaluate(test_x, test_y)
    • model %>% predict_classes(test_x)
  • Visualizations occur automatically during the fit() call, or can be called using TensorBoard
    • model %>% fit(x=train_x, y=train_y, epochs=25, validation_split=0.25, callbacks=callback_tensorboard(“logs/run_1”))
    • tensorboard(“logs/run_1”)

Create DNN Using Estimators API:

  • There is a six-step process for using Estimators to run a DNN
    • Split into test and train data
    • feature_columns <- feature_columns(numeric_column(“colNum1”), column_categorical_with_vocabulary_list(“colVocab1”, “vocabList”), …)
    • classifier <- dnn_classifier(feature_columns=feature_columns, hidden_units=c(5, 10, 5), n_classes=2)
    • data_input_function <- function(data) { input_fn(data, features=feature_columns, response=“nameofresponsevariable”) }
    • train(classifier, input_fn = inputfunctionname(trainingdata_name))
    • eval <- evaluate(classifier, input_fn=inputfunctionname(testingdata_name))
    • preds <- predict(classifier, input_fn=inputfunctionname(testingdata_name))

Example code includes:

# Define the model: 8 input features -> 15 relu -> 5 relu -> 1 output unit
model <-  keras_model_sequential()
model %>%
    layer_dense(units=15, activation = 'relu', input_shape = 8) %>%
    layer_dense(units=5, activation = 'relu') %>%
    layer_dense(units=1)

# Compile the model
model %>%
    compile(optimizer = 'rmsprop', loss = 'mse', metrics = c('accuracy'))


# Fit the model (validation_split holds out 20% of the training data)
model %>%
    fit(x = train_x, y = train_y, epochs = 25, batch_size=32, validation_split = .2)


# Evaluate the model
score <- model %>%
    evaluate(test_x, test_y)

# Call up the accuracy 
score$acc


# Predict based on your model
predictedclasses <- model %>%
    predict_classes(newdata_x)

# Print predicted classes with customers' names
rownames(predictedclasses) <- c('Jasmit', 'Banjeet')
predictedclasses


# Fit the model and define callbacks
# (callback_tensorboard logs this run so TensorBoard can visualize it below)
model %>%
    fit(x = train_x, y = train_y,epochs = 25, batch_size = 32, validation_split = .2, 
        callbacks = callback_tensorboard("logs/run_1")
        )

# Call TensorBoard
tensorboard("logs/run_1")


# Train the model
train(dnnclassifier, input_fn = shopping_input_function(shopper_train))

# Evaluate the model by correcting the error
evaluate(dnnclassifier, input_fn = shopping_input_function(shopper_test))


# Create a sequential model and the network architecture
ourdnnmodel <- keras_model_sequential() %>%
    layer_dense(units = 10, activation = "relu", input_shape = ncol(train_x)) %>%
    layer_dense(units = 5, activation = "relu") %>%
    layer_dense(units = 1) %>%
    compile(optimizer = 'rmsprop', loss = 'mse', metrics = c("mae", "accuracy"))

# Fit your model
learn <- ourdnnmodel %>% 
    fit(x = train_x, y = train_y, epochs = 25, batch_size = 32, validation_split = 0.2, verbose = FALSE)

# Run the learn function
learn

Chapter 4 - Deep Learning in TensorFlow: Increasing Model Accuracy

L2 Regularization Using Keras:

  • Regularization is an attempt to minimize over-fitting by penalizing use of too many coefficients in training
  • L2 Regularization (Ridge) introduces a complexity penalty to the least-squares approach
    • model <- keras_model_sequential()
    • model %>% layer_dense(units=15, activation=“relu”, input_shape=8, kernel_regularizer=regularizer_l2(l=0.001))

Dropout Technique Using TFEstimators:

  • Dropout is a common form of regularization that prevents over-fitting
    • Some of the hidden nodes are temporarily hidden (dropped out)
    • The input and output layers remain unchanged in this technique
  • Example of using a dnn_classifier with dropout regularization
    • ourmodel <- dnn_classifier(hidden_units=6, feature_columns=ftr_colns, dropout=0.5) # this is a 50% – 0.5 – dropout model

Hyperparameter Tuning with tfruns:

  • Hyperparameters for the neural network can be tuned for optimized performance
  • Best practices are to store the code as a training script, identify flags for iterable parameters
    • runs <- training_run(“mycode.R”, flags=list(dropout=c(0.2, 0.3, 0.4, 0.5), activation=c(“relu”, “softmax”)))

Wrap Up:

  • Introduction to TensorFlow - syntax and core concepts
  • Learning the Basics - core API and Estimators
  • Deep Learning in TensorFlow
  • Model Regularization

Example code includes:

# Define the model
model_lesson1 <- keras_model_sequential()

# Add the regularizer (L2/ridge penalty, strength l = 0.1, on the first layer)
model_lesson1 %>%
  layer_dense(units=15, activation='relu', input_shape=8, kernel_regularizer=regularizer_l2(l=0.1)) %>%
  layer_dense(units=5, activation = 'relu') %>%
  layer_dense(units=1)


# Compile the model
model_lesson1 %>%
    compile(optimizer = 'rmsprop', loss = 'mse', metrics = c('accuracy'))

# Fit the model
model_lesson1 %>%
    fit(x = train_x, y = train_y, epochs = 25, batch_size = 32, validation_split=0.2)


# Evaluate the model
score_lesson1 <- model_lesson1 %>%
    evaluate(test_x, test_y)

# Call the accuracy and loss
score_lesson1$acc
score_lesson1$loss


# Define the feature columns (all four banknote predictors are numeric)
featcols <- feature_columns(
    tf$feature_column$numeric_column("Var"), tf$feature_column$numeric_column("Skew"), 
    tf$feature_column$numeric_column("Kurt"), tf$feature_column$numeric_column("Entropy")
)

# Create the input function 
banknote_input_fn <- function(data){
    input_fn(data, features = c("Var", "Skew", "Kurt", "Entropy"), response = "Class")
}


# Create your dnn_classifier model
# (dropout = 0.2 randomly drops 20% of hidden nodes during training)
mymodel <- dnn_classifier(feature_columns = featcols, hidden_units = c(40, 60, 10), n_classes = 2, 
                          label_vocabulary = c("N", "Y"), dropout = 0.2
                          )

# Train the model
train(mymodel, input_fn = banknote_input_fn(banknote_authentication_train))


# Evaluate your model using the testing dataset
final_evaluation <- evaluate(mymodel, input_fn = banknote_input_fn(banknote_authentication_test))

# Call up the accuracy and precision of your evaluated model
final_evaluation$accuracy
final_evaluation$precision


# Tune the run (tries each dropout value; runs ranked by accuracy below)
runs <- tuning_run(modelsourcecode_script, flags = list(dropout = c(0.2, 0.3, 0.4)))

# View the outcome
runs[order(runs$eval_accuracy, decreasing = TRUE), ]


# Tune the run (full grid over dropout x activation combinations)
runs <- tuning_run(
  modelsourcecode_script, flags = list(dropout = c(0.2, 0.3, 0.4), activation = c("relu", "softmax") )
)

# View the outcome
runs[order(runs$eval_accuracy, decreasing = TRUE), ]

Market Basket Analysis in R

Chapter 1 - Introduction to Market Basket Analysis

Market Basket Introduction:

  • The basket is a collection of items such as a cart of products at a supermarket or the selected products at Amazon
  • Can take basket data in R and count, summarize unique products, plot purchases by product, etc.
  • Can also assess whether there are relationships between items in a basket

Item Combinations:

  • Market basket analysis focuses on which products have been purchased rather than what quantities of products have been purchased
    • The empty set is always considered to be a subset of any market basket
    • Often interesting to understand the intersection and union of market baskets - union(A, B)

What is Market Basket Analysis?

  • Can analyze multiple baskets to see if there are associations among products commonly purchased together
    • Place items together, make recommendations that “customers also bought this”, etc.
  • Market basket analysis is sometimes known as association rule-mining

Example code includes:

# Load the online retail transactions; despite the .xls extension the file is
# plain delimited text, so readr::read_csv() parses it (spec echoed below)
Online_Retail_2011_Q1 <- readr::read_csv("./RInputFiles/Online_Retail_2011_Q1.xls")
## Parsed with column specification:
## cols(
##   InvoiceNo = col_character(),
##   StockCode = col_character(),
##   Description = col_character(),
##   Quantity = col_double(),
##   InvoiceDate = col_character(),
##   UnitPrice = col_double(),
##   CustomerID = col_double(),
##   Country = col_character()
## )
# Inspect the structure of the retail data (99,602 rows x 8 columns)
str(Online_Retail_2011_Q1)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 99602 obs. of  8 variables:
##  $ InvoiceNo  : chr  "539993" "539993" "539993" "539993" ...
##  $ StockCode  : chr  "22386" "21499" "21498" "22379" ...
##  $ Description: chr  "JUMBO BAG PINK POLKADOT" "BLUE POLKADOT WRAP" "RED RETROSPOT WRAP" "RECYCLING BAG RETROSPOT" ...
##  $ Quantity   : num  10 25 25 5 10 10 6 12 6 8 ...
##  $ InvoiceDate: chr  "04/01/2011 10:00" "04/01/2011 10:00" "04/01/2011 10:00" "04/01/2011 10:00" ...
##  $ UnitPrice  : num  1.95 0.42 0.42 2.1 1.25 1.95 3.25 1.45 2.95 1.95 ...
##  $ CustomerID : num  13313 13313 13313 13313 13313 ...
##  $ Country    : chr  "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   InvoiceNo = col_character(),
##   ..   StockCode = col_character(),
##   ..   Description = col_character(),
##   ..   Quantity = col_double(),
##   ..   InvoiceDate = col_character(),
##   ..   UnitPrice = col_double(),
##   ..   CustomerID = col_double(),
##   ..   Country = col_character()
##   .. )
# Load the MovieLens subset; as above, the .xls file is really delimited text.
# The unnamed first column becomes X1 (see the warning echoed below).
movie_subset <- readr::read_csv("./RInputFiles/Movie_subset.xls")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   userId = col_double(),
##   movieId = col_double(),
##   title = col_character(),
##   year = col_double(),
##   genres = col_character()
## )
# Inspect the structure of the movie data (19,455 rows x 6 columns)
str(movie_subset)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 19455 obs. of  6 variables:
##  $ X1     : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ userId : num  1323 1323 1323 1323 1323 ...
##  $ movieId: num  1 3 5 10 11 12 15 16 17 19 ...
##  $ title  : chr  "Toy Story" "Grumpier Old Men" "Father of the Bride Part II" "GoldenEye" ...
##  $ year   : num  1995 1995 1995 1995 1995 ...
##  $ genres : chr  "Adventure|Animation|Children|Comedy|Fantasy" "Comedy|Romance" "Comedy" "Action|Adventure|Thriller" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   X1 = col_double(),
##   ..   userId = col_double(),
##   ..   movieId = col_double(),
##   ..   title = col_character(),
##   ..   year = col_double(),
##   ..   genres = col_character()
##   .. )
# Have a glimpse at the dataset (transposed preview, one line per column)
glimpse(Online_Retail_2011_Q1)
## Observations: 99,602
## Variables: 8
## $ InvoiceNo   <chr> "539993", "539993", "539993", "539993", "539993", "5399...
## $ StockCode   <chr> "22386", "21499", "21498", "22379", "20718", "85099B", ...
## $ Description <chr> "JUMBO BAG PINK POLKADOT", "BLUE POLKADOT WRAP", "RED R...
## $ Quantity    <dbl> 10, 25, 25, 5, 10, 10, 6, 12, 6, 8, 6, 6, 6, 12, 12, 8,...
## $ InvoiceDate <chr> "04/01/2011 10:00", "04/01/2011 10:00", "04/01/2011 10:...
## $ UnitPrice   <dbl> 1.95, 0.42, 0.42, 2.10, 1.25, 1.95, 3.25, 1.45, 2.95, 1...
## $ CustomerID  <dbl> 13313, 13313, 13313, 13313, 13313, 13313, 13313, 13313,...
## $ Country     <chr> "United Kingdom", "United Kingdom", "United Kingdom", "...
# Pull out every line item belonging to one invoice -- a single basket
One_basket <- filter(Online_Retail_2011_Q1, InvoiceNo == 540180)

print(One_basket)
## # A tibble: 12 x 8
##    InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID
##    <chr>     <chr>     <chr>          <dbl> <chr>           <dbl>      <dbl>
##  1 540180    85123A    WHITE HANG~        2 05/01/2011~      2.95      15984
##  2 540180    22083     PAPER CHAI~        2 05/01/2011~      2.95      15984
##  3 540180    22759     SET OF 3 N~        4 05/01/2011~      1.65      15984
##  4 540180    21677     HEARTS  ST~        4 05/01/2011~      0.85      15984
##  5 540180    22168     ORGANISER ~        1 05/01/2011~      8.5       15984
##  6 540180    22113     GREY HEART~        4 05/01/2011~      3.75      15984
##  7 540180    84978     HANGING HE~        6 05/01/2011~      1.25      15984
##  8 540180    22558     CLOTHES PE~        1 05/01/2011~      1.49      15984
##  9 540180    22163     HEART STRI~        1 05/01/2011~      2.95      15984
## 10 540180    22164     STRING OF ~        1 05/01/2011~      2.95      15984
## 11 540180    85123A    WHITE HANG~        6 05/01/2011~      2.95      15984
## 12 540180    22297     HEART IVOR~       24 05/01/2011~      1.25      15984
## # ... with 1 more variable: Country <chr>
# Basket size: number of distinct product codes in the basket. The invoice
# has 12 rows but only 11 distinct codes -- 85123A appears twice above.
n_distinct(One_basket$StockCode)
## [1] 11
# Total number of items purchased: sum of Quantity over all line items
One_basket %>% 
    summarize(sum(Quantity))
## # A tibble: 1 x 1
##   `sum(Quantity)`
##             <dbl>
## 1              56
# Plot total quantity per product; bars are ordered by summed quantity and the
# axes flipped so the long item descriptions read horizontally
ggplot(One_basket, aes(x = reorder(Description, Quantity, sum), y = Quantity)) +
    geom_col() +
    coord_flip() +
    xlab("Items")

# Number of items
n_items <- 10

# One row per basket size k = 0..n_items: the value of k and the number of
# k-item combinations. choose() is vectorized over k, so the original index
# loop filling a preallocated matrix is unnecessary; cbind() builds the same
# (n_items + 1) x 2 matrix in one expression.
combi <- cbind(0:n_items, choose(n_items, 0:n_items))

# Sum over all values of k: total number of possible baskets (= 2^n_items)
sum(combi[, 2])
## [1] 1024
# Total number of possible baskets: each of the 10 items is either in or out,
# so the combination sum above must equal 2^10
2^10
## [1] 1024
# Define number of items
n_items <- 100

# Number of distinct baskets of size x that can be formed from n_items items
fun_combi <- function(x) choose(n_items, x)

# Draw the combination count as a curve over basket sizes 0..n_items
ggplot(data.frame(x = 0), aes(x = x)) +
    stat_function(fun = fun_combi) +
    xlim(0, n_items)

# Restrict the data to the line items of two chosen invoices
Two_baskets <- filter(Online_Retail_2011_Q1, InvoiceNo %in% c(540160, 540017))

# Per basket: total line items and number of distinct products
Two_baskets %>%
    group_by(InvoiceNo) %>%
    summarise(n_total = n(), n_items = n_distinct(StockCode))
## # A tibble: 2 x 3
##   InvoiceNo n_total n_items
##   <chr>       <int>   <int>
## 1 540017         13      13
## 2 540160          3       3
# Keep only rows with no missing values in any column; the row count drops
# from 99,602 to 70,097 (presumably mostly rows missing CustomerID -- verify)
Online_Retail_clean <- Online_Retail_2011_Q1[complete.cases(Online_Retail_2011_Q1), ]
str(Online_Retail_clean)
## Classes 'tbl_df', 'tbl' and 'data.frame':    70097 obs. of  8 variables:
##  $ InvoiceNo  : chr  "539993" "539993" "539993" "539993" ...
##  $ StockCode  : chr  "22386" "21499" "21498" "22379" ...
##  $ Description: chr  "JUMBO BAG PINK POLKADOT" "BLUE POLKADOT WRAP" "RED RETROSPOT WRAP" "RECYCLING BAG RETROSPOT" ...
##  $ Quantity   : num  10 25 25 5 10 10 6 12 6 8 ...
##  $ InvoiceDate: chr  "04/01/2011 10:00" "04/01/2011 10:00" "04/01/2011 10:00" "04/01/2011 10:00" ...
##  $ UnitPrice  : num  1.95 0.42 0.42 2.1 1.25 1.95 3.25 1.45 2.95 1.95 ...
##  $ CustomerID : num  13313 13313 13313 13313 13313 ...
##  $ Country    : chr  "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
# Per-invoice basket sizes: total line items and distinct products bought
basket_size <- Online_Retail_clean %>%
    group_by(InvoiceNo) %>%
    summarise(n_total = n(),
              n_items = n_distinct(StockCode))

head(basket_size)
## # A tibble: 6 x 3
##   InvoiceNo n_total n_items
##   <chr>       <int>   <int>
## 1 539993         17      17
## 2 540001          9       9
## 3 540002          4       4
## 4 540003         22      22
## 5 540004          1       1
## 6 540005         16      14
# Average basket size across all invoices: total items vs distinct items
basket_size %>% 
    summarize(avg_total_items = mean(n_total), avg_dist_items = mean(n_items))
## # A tibble: 1 x 2
##   avg_total_items avg_dist_items
##             <dbl>          <dbl>
## 1            17.3           16.9
# Histogram of distinct items per basket; the default 30 bins trigger the
# stat_bin message echoed below
ggplot(basket_size) +
    geom_histogram(aes(x = n_items)) +
    ggtitle("Distribution of basket sizes")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Number of total and distinct items for HERB MARKER THYME: how many line
# items mention it vs how many distinct baskets contain it (53 rows across
# 52 baskets -- one invoice lists the item twice)
Online_Retail_clean %>%
    filter(Description == "HERB MARKER THYME")  %>%
    summarise(n_tot_items = n(), n_basket_item = n_distinct(InvoiceNo))
## # A tibble: 1 x 2
##   n_tot_items n_basket_item
##         <int>         <int>
## 1          53            52
# Number of baskets containing both items: keep rows for either herb marker,
# count matching rows per invoice, and keep invoices with exactly 2 matches.
# NOTE(review): n == 2 assumes each item occurs at most once per invoice;
# a duplicated line item would make a single-item basket count as a pair.
Online_Retail_clean %>%
    filter(Description %in% c("HERB MARKER ROSEMARY", "HERB MARKER THYME")) %>%
    group_by(InvoiceNo) %>% 
    summarise(n = n()) %>% 
    filter(n==2) %>% 
    summarise(n_distinct(InvoiceNo))
## # A tibble: 1 x 1
##   `n_distinct(InvoiceNo)`
##                     <int>
## 1                      48

Chapter 2 - Metrics and Techniques in Market Basket Analysis

Transactional Data:

  • Transactions are defined as the activity of buying or selling something
    • Market basket analysis is based on transactional data - each basket is a single transaction containing one or more items
  • Can coerce lists, matrices, and data frames to transactional class data
    • myList <- split(myData$Product, myData$OrderID)
    • data_trx <- as(myList, “transactions”)
    • inspect(head(data_trx))
    • image(data_trx)

Metrics in Market Basket Analysis:

  • If someone who buys A typically then also buys B, A is called the antecedent and B is called the consequent
  • There are several metrics for defining prevalence of items in baskets
    • Support for X is the percentage of baskets that contain item X
    • Confidence for (XY) is defined as support for (XY) divided by support for (X) - how often Y is in the basket conditional on X being in the basket
    • Lift is the strength of the association - defined as support for (XY) divided by support(X) divided by support(Y) - cutoff is 1 for equal likelihood
  • The arules::apriori() function allows for calculating many of these key metrics

The Apriori Algorithm:

  • Association rule mining allows for discovery of relationships in large, transactional datasets
    • Frequent itemset generation - minimum level of support
    • Rule generation using the frequent itemset generation
  • The apriori algorithm uses a bottom-up approach to generate candidate items
    • If an itemset is frequent, then all of its subsets are frequent
    • If an item is infrequent, then all of its supersets are infrequent
  • The basic rule generation algorithm includes
    • Start with high confidence rules with a single item in the consequent (right-hand side)
    • Build more complex rules with more items on the right side
  • Example for the initial run of the apriori algorithm
    • support.all = apriori(trans, parameter=list(supp=3/7, target=“frequent itemsets”))

Using Apriori for “if this then that”:

  • Can extract frequent datasets and associated rules
  • The appearance argument of apriori() can be used to select specific itemsets
    • appearance=list(rhs=“Cheese”) # will only extract rules with ‘Cheese’ on rhs
  • Rules are considered redundant if a more general rule (super rule) with the same or higher confidence already exists
    • arules::is.redundant(rules)

Example code includes:

library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
# Build one character vector of product descriptions per invoice number
data_list <- split(Online_Retail_clean$Description, Online_Retail_clean$InvoiceNo)

# Coerce the list of baskets to the arules 'transactions' class; duplicated
# items within a basket are dropped (hence the warning echoed below)
Online_trx <- as(data_list, "transactions")
## Warning in asMethod(object): removing duplicated items in transactions
# Summary of transactions: item-matrix dimensions and density, the most
# frequent items, and the basket-size distribution
summary(Online_trx)
## transactions as itemMatrix in sparse format with
##  4057 rows (elements/itemsets/transactions) and
##  2662 columns (items) and a density of 0.006350527 
## 
## most frequent items:
## WHITE HANGING HEART T-LIGHT HOLDER           REGENCY CAKESTAND 3 TIER 
##                                460                                456 
##   SET OF 3 CAKE TINS PANTRY DESIGN            JUMBO BAG RED RETROSPOT 
##                                432                                292 
##  SET OF 6 SPICE TINS PANTRY DESIGN                            (Other) 
##                                279                              66665 
## 
## element (itemset/transaction) length distribution:
## sizes
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
## 635 271 210 144 153  97 109 120 117 102 119  87  83  88  90 112  83  78 104  71 
##  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
##  77  73  60  53  33  41  45  45  53  43  35  27  28  35  24  22  16  19  28  17 
##  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
##  27  22  18  18  13  14   9  18  14  14  12  13  12  10   9   8   9   7   3  12 
##  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
##   4   3   8   9   4   4   7   6   5   6   6   4   4   4   4   3   2   5   2   2 
##  81  82  83  85  86  87  89  92  93  94  95  96  97  98 100 101 104 106 107 108 
##   1   4   1   2   2   1   6   1   3   1   1   1   1   4   1   2   1   2   2   2 
## 110 111 114 116 118 120 124 130 131 149 151 153 171 227 270 
##   2   1   1   1   1   1   1   1   1   2   1   1   1   1   1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    3.00   11.00   16.91   23.00  270.00 
## 
## includes extended item information - examples:
##                       labels
## 1     10 COLOUR SPACEBOY PEN
## 2 12 COLOURED PARTY BALLOONS
## 3  12 DAISY PEGS IN WOOD BOX
## 
## includes extended transaction information - examples:
##   transactionID
## 1        539993
## 2        540001
## 3        540002
# inspect first 3 transactions (item sets plus the invoice as transactionID)
inspect(head(Online_trx, 3))
##     items                                 transactionID
## [1] {BLUE POLKADOT WRAP,                               
##      CAST IRON HOOK GARDEN FORK,                       
##      CHILDRENS APRON APPLES DESIGN,                    
##      COFFEE MUG APPLES DESIGN,                         
##      COFFEE MUG PEARS  DESIGN,                         
##      JAM MAKING SET PRINTED,                           
##      JUMBO BAG PINK POLKADOT,                          
##      JUMBO BAG RED RETROSPOT,                          
##      LOVE HEART NAPKIN BOX,                            
##      PEG BAG APPLES DESIGN,                            
##      RECIPE BOX RETROSPOT,                             
##      RECYCLING BAG RETROSPOT,                          
##      RED RETROSPOT CHILDRENS UMBRELLA,                 
##      RED RETROSPOT SHOPPER BAG,                        
##      RED RETROSPOT WRAP,                               
##      SET OF 6 T-LIGHTS EASTER CHICKS,                  
##      WHITE HANGING HEART T-LIGHT HOLDER}         539993
## [2] {CERAMIC BOWL WITH LOVE HEART DESIGN,              
##      CERAMIC CHERRY CAKE MONEY BANK,                   
##      DOORSTOP RETROSPOT HEART,                         
##      GINGHAM HEART  DOORSTOP RED,                      
##      LARGE CAKE STAND HANGING HEARTS,                  
##      LOVE HEART POCKET WARMER,                         
##      PLACE SETTING WHITE HEART,                        
##      RED HANGING HEART T-LIGHT HOLDER,                 
##      SWEETHEART CERAMIC TRINKET BOX}             540001
## [3] {GARDEN METAL SIGN,                                
##      RED KITCHEN SCALES,                               
##      VICTORIAN SEWING BOX SMALL,                       
##      VINTAGE SNAP CARDS}                         540002
# inspect last 5 transactions; note the 'C'-prefixed invoice numbers,
# presumably cancellations/credits -- confirm against the data dictionary
inspect(tail(Online_trx, 5))
##     items                                transactionID
## [1] {RED RETROSPOT BUTTER DISH,                       
##      RED RETROSPOT TEA CUP AND SAUCER,                
##      SMALL RED RETROSPOT MUG IN BOX,                  
##      SMALL WHITE RETROSPOT MUG IN BOX,                
##      STRAWBERRY FAIRY CAKE TEAPOT}             C548503
## [2] {ABC TREASURE BOOK BOX}                    C548508
## [3] {WHITE HANGING HEART T-LIGHT HOLDER}       C548513
## [4] {CREAM CUPID HEARTS COAT HANGER,                  
##      RED RETROSPOT CAKE STAND,                        
##      REGENCY CAKESTAND 3 TIER,                        
##      WOODEN FRAME ANTIQUE WHITE,                      
##      WOODEN PICTURE FRAME WHITE FINISH}        C548532
## [5] {Manual,                                          
##      SILVER HANGING T-LIGHT HOLDER}            C548543
# inspect transaction 10 (single transaction selected by position)
inspect(Online_trx[10])
##     items                                 transactionID
## [1] {AGED GLASS SILVER T-LIGHT HOLDER,                 
##      FLUTED ANTIQUE CANDLE HOLDER,                     
##      LOVE HEART NAPKIN BOX,                            
##      MULTI COLOUR SILVER T-LIGHT HOLDER,               
##      RED RETROSPOT MUG,                                
##      RED RETROSPOT TRADITIONAL TEAPOT,                 
##      RETROSPOT LARGE MILK JUG,                         
##      SET 20 NAPKINS FAIRY CAKES DESIGN,                
##      SET/20 RED RETROSPOT PAPER NAPKINS,               
##      SET/5 RED RETROSPOT LID GLASS BOWLS,              
##      WHITE HANGING HEART T-LIGHT HOLDER}         540016
# Inspect specific transactions selected by position
inspect(Online_trx[c(12, 20, 22)])
##     items                                transactionID
## [1] {12 PENCILS TALL TUBE RED RETROSPOT,              
##      200 RED + WHITE BENDY STRAWS,                    
##      60 CAKE CASES DOLLY GIRL DESIGN,                 
##      72 SWEETHEART FAIRY CAKE CASES,                  
##      BAG 500g SWIRLY MARBLES,                         
##      BROWN CHECK CAT DOORSTOP,                        
##      CALENDAR PAPER CUT DESIGN,                       
##      COFFEE MUG APPLES DESIGN,                        
##      COFFEE MUG PEARS  DESIGN,                        
##      CREAM WALL PLANTER HEART SHAPED,                 
##      ENAMEL FLOWER JUG CREAM,                         
##      KEY FOB , BACK DOOR,                             
##      KEY FOB , GARAGE DESIGN,                         
##      KEY FOB , SHED,                                  
##      MEMO BOARD RETROSPOT  DESIGN,                    
##      PACK OF 12 HEARTS DESIGN TISSUES,                
##      PACK OF 12 TRADITIONAL CRAYONS,                  
##      PENS ASSORTED FUNNY FACE,                        
##      POTTING SHED TEA MUG,                            
##      RED RETROSPOT ROUND CAKE TINS,                   
##      RETROSPOT LAMP,                                  
##      RETROSPOT TEA SET CERAMIC 11 PC,                 
##      ROMANTIC PINKS RIBBONS,                          
##      SET 12 KIDS COLOUR  CHALK STICKS,                
##      SET OF 36 DINOSAUR PAPER DOILIES,                
##      SILVER HANGING T-LIGHT HOLDER,                   
##      TEA TIME PARTY BUNTING,                          
##      VINTAGE SNAKES & LADDERS,                        
##      WHITE WOOD GARDEN PLANT LADDER}            540019
## [2] {BAKING SET 9 PIECE RETROSPOT,                    
##      BREAD BIN DINER STYLE PINK,                      
##      BREAD BIN DINER STYLE RED,                       
##      CHILDS BREAKFAST SET DOLLY GIRL,                 
##      FRENCH ENAMEL POT W LID,                         
##      FRYING PAN RED RETROSPOT,                        
##      GUMBALL MAGAZINE RACK,                           
##      JARDIN ETCHED GLASS CHEESE DISH,                 
##      RED RETROSPOT TRADITIONAL TEAPOT,                
##      RETROSPOT TEA SET CERAMIC 11 PC,                 
##      SET OF 16 VINTAGE RED CUTLERY,                   
##      TRIPLE PHOTO FRAME CORNICE,                      
##      VICTORIAN SEWING BOX LARGE}                540028
## [3] {ALPHABET STENCIL CRAFT,                          
##      ASSORTED COLOUR BIRD ORNAMENT,                   
##      DOORMAT AIRMAIL,                                 
##      DOORMAT I LOVE LONDON,                           
##      DOORMAT RESPECTABLE HOUSE,                       
##      FANNY'S REST STOPMETAL SIGN,                     
##      FELTCRAFT 6 FLOWER FRIENDS,                      
##      FELTCRAFT BUTTERFLY HEARTS,                      
##      HAPPY STENCIL CRAFT,                             
##      HEART OF WICKER LARGE,                           
##      HEART OF WICKER SMALL,                           
##      HOME BUILDING BLOCK WORD,                        
##      HOT BATHS METAL SIGN,                            
##      I'M ON HOLIDAY METAL SIGN,                       
##      JOY WOODEN BLOCK LETTERS,                        
##      LAUNDRY 15C METAL SIGN,                          
##      LAVENDER SCENTED FABRIC HEART,                   
##      LOVE BUILDING BLOCK WORD,                        
##      MAN FLU METAL SIGN,                              
##      METAL SIGN TAKE IT OR LEAVE IT,                  
##      NO JUNK MAIL METAL SIGN,                         
##      PEACE WOODEN BLOCK LETTERS,                      
##      PLEASE ONE PERSON METAL SIGN,                    
##      ROSE FOLKART HEART DECORATIONS,                  
##      UNION JACK FLAG LUGGAGE TAG,                     
##      UNION JACK FLAG PASSPORT COVER,                  
##      VICTORIAN  METAL POSTCARD SPRING,                
##      WAKE UP COCKEREL CALENDAR SIGN,                  
##      WOOD S/3 CABINET ANT WHITE FINISH,               
##      YOU'RE CONFUSING ME METAL SIGN}            540031
# Determine the support of both items with minimum support 0.1. The threshold
# (405 of 4057 transactions) is far above either item's frequency, so the run
# writes 0 itemsets -- see the log echoed below.
support_rosemary_thyme <- apriori(Online_trx, parameter = list(target = "frequent itemsets", supp = 0.1),
                                  appearance = list(items = c("HERB MARKER ROSEMARY", "HERB MARKER THYME"))
                                  )
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##          NA    0.1    1 none FALSE            TRUE       5     0.1      1
##  maxlen            target   ext
##      10 frequent itemsets FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 405 
## 
## set item appearances ...[2 item(s)] done [0.00s].
## set transactions ...[2 item(s), 4057 transaction(s)] done [0.04s].
## sorting and recoding items ... [0 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 done [0.00s].
## writing ... [0 set(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Inspect the itemsets -- prints nothing because the 0.1 run found 0 sets
inspect(support_rosemary_thyme)

# Re-run with the lower minimum support 0.01 (absolute count 40 of 4057
# transactions), which both herb markers do clear (3 itemsets written)
support_rosemary_thyme <- apriori(Online_trx, parameter = list(target = "frequent itemsets", supp = 0.01),
                                  appearance = list(items = c("HERB MARKER ROSEMARY", "HERB MARKER THYME"))
                                  )
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##          NA    0.1    1 none FALSE            TRUE       5    0.01      1
##  maxlen            target   ext
##      10 frequent itemsets FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[2 item(s)] done [0.00s].
## set transactions ...[2 item(s), 4057 transaction(s)] done [0.02s].
## sorting and recoding items ... [2 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [3 set(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Inspect the itemsets: each single item plus the pair, with support and count
inspect(support_rosemary_thyme)
##     items                                    support    count
## [1] {HERB MARKER ROSEMARY}                   0.01257087 51   
## [2] {HERB MARKER THYME}                      0.01281735 52   
## [3] {HERB MARKER ROSEMARY,HERB MARKER THYME} 0.01207789 49
# Frequent itemsets for all items at minimum support 0.01 (854 sets written)
support_all <- apriori(Online_trx, parameter = list(target="frequent itemsets", supp = 0.01))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##          NA    0.1    1 none FALSE            TRUE       5    0.01      1
##  maxlen            target   ext
##      10 frequent itemsets FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2662 item(s), 4057 transaction(s)] done [0.03s].
## sorting and recoding items ... [524 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [854 set(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Inspect the 5 most frequent items (sorted by support, descending)
inspect(head(sort(support_all, by="support"), 5))
##     items                                support    count
## [1] {WHITE HANGING HEART T-LIGHT HOLDER} 0.11338427 460  
## [2] {REGENCY CAKESTAND 3 TIER}           0.11239832 456  
## [3] {SET OF 3 CAKE TINS PANTRY DESIGN}   0.10648262 432  
## [4] {JUMBO BAG RED RETROSPOT}            0.07197437 292  
## [5] {SET OF 6 SPICE TINS PANTRY DESIGN}  0.06877003 279
# Mine association rules (the default target) at support 0.01 and
# confidence 0.4 -- 384 rules are written
rules_all <- apriori(Online_trx, parameter = list(supp=0.01, conf = 0.4))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.4    0.1    1 none FALSE            TRUE       5    0.01      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2662 item(s), 4057 transaction(s)] done [0.03s].
## sorting and recoding items ... [524 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [384 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Same mining with minlen=2 to exclude rules with an empty left-hand side;
# the rule count is unchanged here (384 rules in both runs)
rules_all <- apriori(Online_trx, parameter = list(supp=0.01, conf = 0.4, minlen=2))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.4    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2662 item(s), 4057 transaction(s)] done [0.04s].
## sorting and recoding items ... [524 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [384 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Inspect the 5 rules with highest confidence
inspect(head(sort(rules_all, by="confidence"), 5))
##     lhs                       rhs                       support confidence     lift count
## [1] {HERB MARKER PARSLEY,                                                                
##      HERB MARKER ROSEMARY} => {HERB MARKER THYME}    0.01059896  0.9772727 76.24607    43
## [2] {HERB MARKER ROSEMARY} => {HERB MARKER THYME}    0.01207789  0.9607843 74.95965    49
## [3] {HERB MARKER MINT,                                                                   
##      HERB MARKER ROSEMARY} => {HERB MARKER THYME}    0.01109194  0.9574468 74.69926    45
## [4] {HERB MARKER MINT,                                                                   
##      HERB MARKER THYME}    => {HERB MARKER ROSEMARY} 0.01109194  0.9574468 76.16395    45
## [5] {HERB MARKER PARSLEY,                                                                
##      HERB MARKER THYME}    => {HERB MARKER ROSEMARY} 0.01059896  0.9555556 76.01351    43
# Inspect the 5 rules with highest lift
inspect(head(sort(rules_all, by="lift"), 5))
##     lhs                       rhs                       support confidence     lift count
## [1] {HERB MARKER PARSLEY,                                                                
##      HERB MARKER ROSEMARY} => {HERB MARKER THYME}    0.01059896  0.9772727 76.24607    43
## [2] {HERB MARKER MINT,                                                                   
##      HERB MARKER THYME}    => {HERB MARKER ROSEMARY} 0.01109194  0.9574468 76.16395    45
## [3] {HERB MARKER PARSLEY,                                                                
##      HERB MARKER THYME}    => {HERB MARKER ROSEMARY} 0.01059896  0.9555556 76.01351    43
## [4] {HERB MARKER ROSEMARY} => {HERB MARKER THYME}    0.01207789  0.9607843 74.95965    49
## [5] {HERB MARKER THYME}    => {HERB MARKER ROSEMARY} 0.01207789  0.9423077 74.95965    49
# Rules that predict HERB MARKER ROSEMARY: restrict the right-hand side to
# that single item and leave the left-hand side unrestricted
rules_rosemary_rhs <- apriori(Online_trx, parameter = list(supp=0.01, conf=0.5, minlen=2),
                              appearance = list(rhs="HERB MARKER ROSEMARY", default = "lhs")
                              )
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[2662 item(s), 4057 transaction(s)] done [0.04s].
## sorting and recoding items ... [524 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [7 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Inspect the rules (7 rules with ROSEMARY on the right-hand side)
inspect(rules_rosemary_rhs)
##     lhs                      rhs                       support confidence     lift count
## [1] {HERB MARKER BASIL}   => {HERB MARKER ROSEMARY} 0.01035248  0.8936170 71.08636    42
## [2] {HERB MARKER PARSLEY} => {HERB MARKER ROSEMARY} 0.01084545  0.8979592 71.43177    44
## [3] {HERB MARKER THYME}   => {HERB MARKER ROSEMARY} 0.01207789  0.9423077 74.95965    49
## [4] {HERB MARKER MINT}    => {HERB MARKER ROSEMARY} 0.01158491  0.8867925 70.54347    47
## [5] {HERB MARKER PARSLEY,                                                               
##      HERB MARKER THYME}   => {HERB MARKER ROSEMARY} 0.01059896  0.9555556 76.01351    43
## [6] {HERB MARKER MINT,                                                                  
##      HERB MARKER PARSLEY} => {HERB MARKER ROSEMARY} 0.01010599  0.9318182 74.12522    41
## [7] {HERB MARKER MINT,                                                                  
##      HERB MARKER THYME}   => {HERB MARKER ROSEMARY} 0.01109194  0.9574468 76.16395    45
# Rules triggered by HERB MARKER ROSEMARY: restrict the left-hand side to
# that single item and leave the right-hand side unrestricted
rules_rosemary_lhs <- apriori(Online_trx, parameter = list(supp=0.01, conf=0.5, minlen=2),
                              appearance = list(lhs="HERB MARKER ROSEMARY", default = "rhs")
                              )
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[2662 item(s), 4057 transaction(s)] done [0.04s].
## sorting and recoding items ... [524 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [4 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Inspect the rules
inspect(rules_rosemary_lhs)
##     lhs                       rhs                   support    confidence
## [1] {HERB MARKER ROSEMARY} => {HERB MARKER BASIL}   0.01035248 0.8235294 
## [2] {HERB MARKER ROSEMARY} => {HERB MARKER PARSLEY} 0.01084545 0.8627451 
## [3] {HERB MARKER ROSEMARY} => {HERB MARKER THYME}   0.01207789 0.9607843 
## [4] {HERB MARKER ROSEMARY} => {HERB MARKER MINT}    0.01158491 0.9215686 
##     lift     count
## [1] 71.08636 42   
## [2] 71.43177 44   
## [3] 74.95965 49   
## [4] 70.54347 47
# Create the union of the rules and inspect
rules_rosemary <- arules::union(rules_rosemary_rhs, rules_rosemary_lhs)
inspect(rules_rosemary)
##      lhs                       rhs                       support confidence     lift count
## [1]  {HERB MARKER BASIL}    => {HERB MARKER ROSEMARY} 0.01035248  0.8936170 71.08636    42
## [2]  {HERB MARKER PARSLEY}  => {HERB MARKER ROSEMARY} 0.01084545  0.8979592 71.43177    44
## [3]  {HERB MARKER THYME}    => {HERB MARKER ROSEMARY} 0.01207789  0.9423077 74.95965    49
## [4]  {HERB MARKER MINT}     => {HERB MARKER ROSEMARY} 0.01158491  0.8867925 70.54347    47
## [5]  {HERB MARKER PARSLEY,                                                                
##       HERB MARKER THYME}    => {HERB MARKER ROSEMARY} 0.01059896  0.9555556 76.01351    43
## [6]  {HERB MARKER MINT,                                                                   
##       HERB MARKER PARSLEY}  => {HERB MARKER ROSEMARY} 0.01010599  0.9318182 74.12522    41
## [7]  {HERB MARKER MINT,                                                                   
##       HERB MARKER THYME}    => {HERB MARKER ROSEMARY} 0.01109194  0.9574468 76.16395    45
## [8]  {HERB MARKER ROSEMARY} => {HERB MARKER BASIL}    0.01035248  0.8235294 71.08636    42
## [9]  {HERB MARKER ROSEMARY} => {HERB MARKER PARSLEY}  0.01084545  0.8627451 71.43177    44
## [10] {HERB MARKER ROSEMARY} => {HERB MARKER THYME}    0.01207789  0.9607843 74.95965    49
## [11] {HERB MARKER ROSEMARY} => {HERB MARKER MINT}     0.01158491  0.9215686 70.54347    47
# -- Mine association rules over the whole Online retail dataset -------------

# Apply the apriori function to the Online retail dataset
# No appearance constraint: all rules with support >= 1% and confidence >= 80%
rules_online <- apriori(Online_trx, parameter = list(supp = 0.01, conf = 0.8, minlen = 2))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2662 item(s), 4057 transaction(s)] done [0.06s].
## sorting and recoding items ... [524 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.01s].
## writing ... [52 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Inspect the first 5 rules
inspect(head(rules_online, 5))
##     lhs                       rhs                    support    confidence
## [1] {HERB MARKER BASIL}    => {HERB MARKER ROSEMARY} 0.01035248 0.8936170 
## [2] {HERB MARKER ROSEMARY} => {HERB MARKER BASIL}    0.01035248 0.8235294 
## [3] {HERB MARKER BASIL}    => {HERB MARKER THYME}    0.01059896 0.9148936 
## [4] {HERB MARKER THYME}    => {HERB MARKER BASIL}    0.01059896 0.8269231 
## [5] {HERB MARKER BASIL}    => {HERB MARKER MINT}     0.01059896 0.9148936 
##     lift     count
## [1] 71.08636 42   
## [2] 71.08636 42   
## [3] 71.37930 43   
## [4] 71.37930 43   
## [5] 70.03252 43
# Inspect the first 5 rules with highest lift
# sort() orders the rules object by the chosen quality measure before head()
inspect(head(sort(rules_online, by="lift"), 5))
##     lhs                       rhs                       support confidence     lift count
## [1] {HERB MARKER PARSLEY,                                                                
##      HERB MARKER ROSEMARY} => {HERB MARKER THYME}    0.01059896  0.9772727 76.24607    43
## [2] {HERB MARKER MINT,                                                                   
##      HERB MARKER THYME}    => {HERB MARKER ROSEMARY} 0.01109194  0.9574468 76.16395    45
## [3] {HERB MARKER PARSLEY,                                                                
##      HERB MARKER THYME}    => {HERB MARKER ROSEMARY} 0.01059896  0.9555556 76.01351    43
## [4] {HERB MARKER ROSEMARY} => {HERB MARKER THYME}    0.01207789  0.9607843 74.95965    49
## [5] {HERB MARKER THYME}    => {HERB MARKER ROSEMARY} 0.01207789  0.9423077 74.95965    49
# Transform the rules back to a dataframe
# as(rules, "data.frame") flattens each rule into a single label string plus
# its quality measures (support, confidence, lift, count)
rules_online_df <- as(rules_online, "data.frame")

# Check the first records 
head(rules_online_df)
##                                           rules    support confidence     lift
## 1 {HERB MARKER BASIL} => {HERB MARKER ROSEMARY} 0.01035248  0.8936170 71.08636
## 2 {HERB MARKER ROSEMARY} => {HERB MARKER BASIL} 0.01035248  0.8235294 71.08636
## 3    {HERB MARKER BASIL} => {HERB MARKER THYME} 0.01059896  0.9148936 71.37930
## 4    {HERB MARKER THYME} => {HERB MARKER BASIL} 0.01059896  0.8269231 71.37930
## 5     {HERB MARKER BASIL} => {HERB MARKER MINT} 0.01059896  0.9148936 70.03252
## 6     {HERB MARKER MINT} => {HERB MARKER BASIL} 0.01059896  0.8113208 70.03252
##   count
## 1    42
## 2    42
## 3    43
## 4    43
## 5    43
## 6    43
# -- Frequent-itemset support for two specific herb markers ------------------

# Apply the apriori function to the Online retail dataset
# NOTE(review): this repeats the identical call made above and re-creates
# rules_online with the same result — likely duplicated course material.
rules_online <- apriori(Online_trx, parameter = list(supp = 0.01, conf = 0.8, minlen = 2))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2662 item(s), 4057 transaction(s)] done [0.06s].
## sorting and recoding items ... [524 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [52 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Inspect the first rules
inspect(head(rules_online))
##     lhs                       rhs                    support    confidence
## [1] {HERB MARKER BASIL}    => {HERB MARKER ROSEMARY} 0.01035248 0.8936170 
## [2] {HERB MARKER ROSEMARY} => {HERB MARKER BASIL}    0.01035248 0.8235294 
## [3] {HERB MARKER BASIL}    => {HERB MARKER THYME}    0.01059896 0.9148936 
## [4] {HERB MARKER THYME}    => {HERB MARKER BASIL}    0.01059896 0.8269231 
## [5] {HERB MARKER BASIL}    => {HERB MARKER MINT}     0.01059896 0.9148936 
## [6] {HERB MARKER MINT}     => {HERB MARKER BASIL}    0.01059896 0.8113208 
##     lift     count
## [1] 71.08636 42   
## [2] 71.08636 42   
## [3] 71.37930 43   
## [4] 71.37930 43   
## [5] 70.03252 43   
## [6] 70.03252 43
# Support of herb markers
# target = "frequent itemsets" mines itemsets instead of rules (no confidence
# threshold applies), and the appearance list restricts mining to these two
# items only — hence the 3 resulting sets: each item alone plus the pair.
supp_herb_markers <- apriori(Online_trx, parameter = list(target = "frequent itemsets", supp = 0.01),
                             appearance = list(items = c("HERB MARKER THYME", "HERB MARKER ROSEMARY"))
                             )
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##          NA    0.1    1 none FALSE            TRUE       5    0.01      1
##  maxlen            target   ext
##      10 frequent itemsets FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[2 item(s)] done [0.00s].
## set transactions ...[2 item(s), 4057 transaction(s)] done [0.04s].
## sorting and recoding items ... [2 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [3 set(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Inspect frequent itemsets
inspect(supp_herb_markers)
##     items                                    support    count
## [1] {HERB MARKER ROSEMARY}                   0.01257087 51   
## [2] {HERB MARKER THYME}                      0.01281735 52   
## [3] {HERB MARKER ROSEMARY,HERB MARKER THYME} 0.01207789 49
# -- Rules with HERB MARKER THYME constrained to one side of the rule --------

# Extract rules for HERB MARKER THYME on rhs of rule
# control = list(verbose=F) suppresses the Apriori progress banner.
# NOTE(review): prefer the literal FALSE over F here (F is reassignable).
rules_thyme_marker_rhs <- apriori(Online_trx, parameter = list(supp=0.01, conf=0.8, minlen=2), 
                                  appearance = list(rhs = "HERB MARKER THYME"), control = list(verbose=F)
                                  )

# Inspect rules
inspect(rules_thyme_marker_rhs)
##     lhs                       rhs                    support confidence     lift count
## [1] {HERB MARKER BASIL}    => {HERB MARKER THYME} 0.01059896  0.9148936 71.37930    43
## [2] {HERB MARKER PARSLEY}  => {HERB MARKER THYME} 0.01109194  0.9183673 71.65031    45
## [3] {HERB MARKER ROSEMARY} => {HERB MARKER THYME} 0.01207789  0.9607843 74.95965    49
## [4] {HERB MARKER MINT}     => {HERB MARKER THYME} 0.01158491  0.8867925 69.18687    47
## [5] {HERB MARKER PARSLEY,                                                             
##      HERB MARKER ROSEMARY} => {HERB MARKER THYME} 0.01059896  0.9772727 76.24607    43
## [6] {HERB MARKER MINT,                                                                
##      HERB MARKER PARSLEY}  => {HERB MARKER THYME} 0.01010599  0.9318182 72.69974    41
## [7] {HERB MARKER MINT,                                                                
##      HERB MARKER ROSEMARY} => {HERB MARKER THYME} 0.01109194  0.9574468 74.69926    45
# Extract rules for HERB MARKER THYME on lhs of rule
# Mirror of the call above with THYME pinned to the antecedent instead
rules_thyme_marker_lhs <- apriori(Online_trx, parameter = list(supp=0.01, conf=0.8, minlen=2), 
                                  appearance = list(lhs = "HERB MARKER THYME"), control = list (verbose=F)
                                  )

# Inspect rules
inspect(rules_thyme_marker_lhs)
##     lhs                    rhs                    support    confidence
## [1] {HERB MARKER THYME} => {HERB MARKER BASIL}    0.01059896 0.8269231 
## [2] {HERB MARKER THYME} => {HERB MARKER PARSLEY}  0.01109194 0.8653846 
## [3] {HERB MARKER THYME} => {HERB MARKER ROSEMARY} 0.01207789 0.9423077 
## [4] {HERB MARKER THYME} => {HERB MARKER MINT}     0.01158491 0.9038462 
##     lift     count
## [1] 71.37930 43   
## [2] 71.65031 45   
## [3] 74.95965 49   
## [4] 69.18687 47
# -- Filter out redundant rules ----------------------------------------------

# Apply the apriori function to the Online retail dataset
rules <- apriori(Online_trx, parameter = list(supp = 0.01, conf = 0.8, minlen = 2))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5    0.01      2
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[2662 item(s), 4057 transaction(s)] done [0.05s].
## sorting and recoding items ... [524 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.01s].
## writing ... [52 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Inspect the first 5 rules
# NOTE(review): the comment says "first 5" but head() defaults to 6 rows
inspect(head(rules))
##     lhs                       rhs                    support    confidence
## [1] {HERB MARKER BASIL}    => {HERB MARKER ROSEMARY} 0.01035248 0.8936170 
## [2] {HERB MARKER ROSEMARY} => {HERB MARKER BASIL}    0.01035248 0.8235294 
## [3] {HERB MARKER BASIL}    => {HERB MARKER THYME}    0.01059896 0.9148936 
## [4] {HERB MARKER THYME}    => {HERB MARKER BASIL}    0.01059896 0.8269231 
## [5] {HERB MARKER BASIL}    => {HERB MARKER MINT}     0.01059896 0.9148936 
## [6] {HERB MARKER MINT}     => {HERB MARKER BASIL}    0.01059896 0.8113208 
##     lift     count
## [1] 71.08636 42   
## [2] 71.08636 42   
## [3] 71.37930 43   
## [4] 71.37930 43   
## [5] 70.03252 43   
## [6] 70.03252 43
# Find out redundant of rules
# is.redundant() returns a logical vector aligned with `rules`, flagging
# rules that a more general rule already covers with equal or higher quality
redundant_rules <- is.redundant(rules)

# Inspect the non redundant rules
# Logical subsetting on the S4 rules object keeps the non-flagged rules
non_redundant_rules <- rules[!redundant_rules]
inspect(head(non_redundant_rules))
##     lhs                       rhs                    support    confidence
## [1] {HERB MARKER BASIL}    => {HERB MARKER ROSEMARY} 0.01035248 0.8936170 
## [2] {HERB MARKER ROSEMARY} => {HERB MARKER BASIL}    0.01035248 0.8235294 
## [3] {HERB MARKER BASIL}    => {HERB MARKER THYME}    0.01059896 0.9148936 
## [4] {HERB MARKER THYME}    => {HERB MARKER BASIL}    0.01059896 0.8269231 
## [5] {HERB MARKER BASIL}    => {HERB MARKER MINT}     0.01059896 0.9148936 
## [6] {HERB MARKER MINT}     => {HERB MARKER BASIL}    0.01059896 0.8113208 
##     lift     count
## [1] 71.08636 42   
## [2] 71.08636 42   
## [3] 71.37930 43   
## [4] 71.37930 43   
## [5] 70.03252 43   
## [6] 70.03252 43

Chapter 3 - Visualization in Market Basket Analysis

Items in the Basket:

  • The item frequency plot is a common starting point for visualizations
    • itemFrequencyPlot(data_trx, main=, topN=, type=“absolute”) # type=‘absolute’ is the traditional barplot while type=‘relative’ will give relative counts; topN orders high to low and limits the number displayed
    • Can add col=, ylab=, cex.names=, horiz=TRUE (for horizontal plots), etc.

Visualizing Metrics:

  • Can use arulesViz::inspectDT(rules) to extract rules in html format that can be viewed using a widget
    • Can also use plot(rules) for a scatterplot of the rules - support vs. confidence, colored by lift
  • The arules::plot() has four main arguments that can be used
    • rulesObject=
    • measure=
    • shading= # can choose ‘support’ to shade by support
    • method= # chooses the type of plot (default is scatter)
  • There are many supported methods of plots that can be specified in method=
    • The ‘two-key plot’ has support vs. confidence, colored by order
    • Can add jitter such as jitter=2 to prevent overlapping and better see the true density of points
  • Can use plotly to have an interactive look at the rules
    • plot(rules, engine=“plotly”)

Rules to Graph-Based Visualizations:

  • Can visualize rules using graph methods as part of plot()
    • plot(rules, method=“graph”, engine=“htmlwidget”)
    • Arrows go lhs -> rule -> rhs
  • Can select either a rule or an item from the drop-down in the upper-left of the graph
  • Can also look at subgraphs by filtering prior to passing rules to the plot() call
  • Can also save graphs for later use

Alternative Rule Plots:

  • Can use grouped methods for plotting
    • plot(rules, method=“grouped”, measure=“lift”, shading=“confidence”)
  • Can use the parallel coordinate plotting method
    • plot(rules, method=“paracoord”) # arrows point to consequent condition
  • Can use ruleExplorer(rules) to open a Shiny app with all of the possible plots, controllable by widgets

Example code includes:

# Display items horizontally
# topN = 5 keeps only the five most frequent items; horiz = TRUE draws the
# bars horizontally so long product names remain readable
itemFrequencyPlot(Online_trx, topN = 5, horiz = TRUE)

# Changing the font of the items
# type = "relative" plots proportions instead of raw counts; cex.names = 0.8
# shrinks the item labels; col = rainbow(10) gives each bar its own color
itemFrequencyPlot(Online_trx, topN = 10, col = rainbow(10), type = "relative", horiz = TRUE, 
                  main = "Relative Item Frequency Plot" ,xlab = "Frequency", cex.names = 0.8
                  )

# Load the rule-visualization package (attaches grid as a dependency)
library(arulesViz)
## Loading required package: grid
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus
# Inspection of the rules
# inspectDT() renders the rules as an interactive HTML/DT table widget
inspectDT(rules_online)
# Create a standard scatterplot
# Default plot() on rules: support vs. confidence, points shaded by lift
plot(rules_online)
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

# Change the axis and legend of the scatterplot
# measure = picks the x/y axes; shading = picks the color scale
plot(rules_online, measure = c("confidence", "lift"), shading = "support")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

# Plot a two-key plot
# Two-key plot: support vs. confidence, colored by rule order (item count)
plot(rules_online, method = "two-key plot")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.

# Plot a matrix plot
# method = "matrix" draws an antecedent-by-consequent grid; the console
# output below lists the row (LHS) and column (RHS) itemset labels
plot(rules_online, method = "matrix")
## Itemsets in Antecedent (LHS)
##  [1] "{HERB MARKER MINT,HERB MARKER THYME}"                                                     
##  [2] "{HERB MARKER PARSLEY,HERB MARKER ROSEMARY}"                                               
##  [3] "{HERB MARKER MINT,HERB MARKER ROSEMARY}"                                                  
##  [4] "{HERB MARKER MINT,HERB MARKER PARSLEY}"                                                   
##  [5] "{HERB MARKER PARSLEY,HERB MARKER THYME}"                                                  
##  [6] "{HERB MARKER ROSEMARY}"                                                                   
##  [7] "{HERB MARKER THYME}"                                                                      
##  [8] "{HERB MARKER ROSEMARY,HERB MARKER THYME}"                                                 
##  [9] "{HERB MARKER BASIL}"                                                                      
## [10] "{HERB MARKER PARSLEY}"                                                                    
## [11] "{HERB MARKER MINT}"                                                                       
## [12] "{CHILDS GARDEN TROWEL PINK}"                                                              
## [13] "{CHILDS GARDEN TROWEL BLUE}"                                                              
## [14] "{SET/10 BLUE POLKADOT PARTY CANDLES}"                                                     
## [15] "{POPPY'S PLAYHOUSE LIVINGROOM}"                                                           
## [16] "{COFFEE MUG PEARS  DESIGN}"                                                               
## [17] "{SET/20 RED RETROSPOT PAPER NAPKINS,SET/6 RED SPOTTY PAPER PLATES}"                       
## [18] "{SET/20 RED RETROSPOT PAPER NAPKINS,SET/6 RED SPOTTY PAPER CUPS}"                         
## [19] "{SET/6 RED SPOTTY PAPER CUPS}"                                                            
## [20] "{BLUE FELT EASTER EGG BASKET}"                                                            
## [21] "{KITCHEN METAL SIGN}"                                                                     
## [22] "{PINK REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER,ROSES REGENCY TEACUP AND SAUCER}"
## [23] "{PINK REGENCY TEACUP AND SAUCER,ROSES REGENCY TEACUP AND SAUCER}"                         
## [24] "{PINK REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER}"                                
## [25] "{GREEN REGENCY TEACUP AND SAUCER,PINK REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER}"
## [26] "{REGENCY CAKESTAND 3 TIER,ROSES REGENCY TEACUP AND SAUCER}"                               
## [27] "{GREEN REGENCY TEACUP AND SAUCER,PINK REGENCY TEACUP AND SAUCER}"                         
## [28] "{GREEN REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER}"                               
## [29] "{SET OF 3 CAKE TINS PANTRY DESIGN,SET OF 6 HERB TINS SKETCHBOOK}"                         
## [30] "{JUMBO  BAG BAROQUE BLACK WHITE,JUMBO STORAGE BAG SUKI}"                                  
## [31] "{CANDLEHOLDER PINK HANGING HEART,RED HANGING HEART T-LIGHT HOLDER}"                       
## Itemsets in Consequent (RHS)
##  [1] "{WHITE HANGING HEART T-LIGHT HOLDER}"
##  [2] "{JUMBO BAG RED RETROSPOT}"           
##  [3] "{SET OF 6 SPICE TINS PANTRY DESIGN}" 
##  [4] "{ROSES REGENCY TEACUP AND SAUCER}"   
##  [5] "{GREEN REGENCY TEACUP AND SAUCER}"   
##  [6] "{BATHROOM METAL SIGN}"               
##  [7] "{CREAM FELT EASTER EGG BASKET}"      
##  [8] "{SET/6 RED SPOTTY PAPER PLATES}"     
##  [9] "{POPPY'S PLAYHOUSE KITCHEN}"         
## [10] "{SET/6 RED SPOTTY PAPER CUPS}"       
## [11] "{COFFEE MUG APPLES DESIGN}"          
## [12] "{SET/10 PINK POLKADOT PARTY CANDLES}"
## [13] "{POPPY'S PLAYHOUSE BEDROOM}"         
## [14] "{CHILDS GARDEN TROWEL PINK}"         
## [15] "{CHILDS GARDEN TROWEL BLUE}"         
## [16] "{HERB MARKER MINT}"                  
## [17] "{HERB MARKER BASIL}"                 
## [18] "{HERB MARKER PARSLEY}"               
## [19] "{HERB MARKER THYME}"                 
## [20] "{HERB MARKER ROSEMARY}"

# Plot a matrix plot with confidence as color coding
# Same grid as above but shaded by confidence instead of the default (lift);
# note the itemset ordering in the listing changes with the shading measure
plot(rules_online, method = "matrix", shading = "confidence")
## Itemsets in Antecedent (LHS)
##  [1] "{HERB MARKER PARSLEY,HERB MARKER ROSEMARY}"                                               
##  [2] "{PINK REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER,ROSES REGENCY TEACUP AND SAUCER}"
##  [3] "{HERB MARKER PARSLEY,HERB MARKER THYME}"                                                  
##  [4] "{HERB MARKER MINT,HERB MARKER PARSLEY}"                                                   
##  [5] "{PINK REGENCY TEACUP AND SAUCER,ROSES REGENCY TEACUP AND SAUCER}"                         
##  [6] "{SET/20 RED RETROSPOT PAPER NAPKINS,SET/6 RED SPOTTY PAPER CUPS}"                         
##  [7] "{HERB MARKER MINT,HERB MARKER ROSEMARY}"                                                  
##  [8] "{HERB MARKER MINT,HERB MARKER THYME}"                                                     
##  [9] "{HERB MARKER BASIL}"                                                                      
## [10] "{HERB MARKER PARSLEY}"                                                                    
## [11] "{HERB MARKER ROSEMARY,HERB MARKER THYME}"                                                 
## [12] "{HERB MARKER ROSEMARY}"                                                                   
## [13] "{SET/6 RED SPOTTY PAPER CUPS}"                                                            
## [14] "{HERB MARKER THYME}"                                                                      
## [15] "{GREEN REGENCY TEACUP AND SAUCER,PINK REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER}"
## [16] "{CANDLEHOLDER PINK HANGING HEART,RED HANGING HEART T-LIGHT HOLDER}"                       
## [17] "{PINK REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER}"                                
## [18] "{BLUE FELT EASTER EGG BASKET}"                                                            
## [19] "{SET/10 BLUE POLKADOT PARTY CANDLES}"                                                     
## [20] "{HERB MARKER MINT}"                                                                       
## [21] "{GREEN REGENCY TEACUP AND SAUCER,PINK REGENCY TEACUP AND SAUCER}"                         
## [22] "{SET/20 RED RETROSPOT PAPER NAPKINS,SET/6 RED SPOTTY PAPER PLATES}"                       
## [23] "{CHILDS GARDEN TROWEL BLUE}"                                                              
## [24] "{REGENCY CAKESTAND 3 TIER,ROSES REGENCY TEACUP AND SAUCER}"                               
## [25] "{POPPY'S PLAYHOUSE LIVINGROOM}"                                                           
## [26] "{KITCHEN METAL SIGN}"                                                                     
## [27] "{CHILDS GARDEN TROWEL PINK}"                                                              
## [28] "{GREEN REGENCY TEACUP AND SAUCER,REGENCY CAKESTAND 3 TIER}"                               
## [29] "{SET OF 3 CAKE TINS PANTRY DESIGN,SET OF 6 HERB TINS SKETCHBOOK}"                         
## [30] "{COFFEE MUG PEARS  DESIGN}"                                                               
## [31] "{JUMBO  BAG BAROQUE BLACK WHITE,JUMBO STORAGE BAG SUKI}"                                  
## Itemsets in Consequent (RHS)
##  [1] "{POPPY'S PLAYHOUSE KITCHEN}"         
##  [2] "{JUMBO BAG RED RETROSPOT}"           
##  [3] "{COFFEE MUG APPLES DESIGN}"          
##  [4] "{SET OF 6 SPICE TINS PANTRY DESIGN}" 
##  [5] "{CHILDS GARDEN TROWEL BLUE}"         
##  [6] "{BATHROOM METAL SIGN}"               
##  [7] "{HERB MARKER BASIL}"                 
##  [8] "{CHILDS GARDEN TROWEL PINK}"         
##  [9] "{SET/6 RED SPOTTY PAPER CUPS}"       
## [10] "{POPPY'S PLAYHOUSE BEDROOM}"         
## [11] "{ROSES REGENCY TEACUP AND SAUCER}"   
## [12] "{SET/10 PINK POLKADOT PARTY CANDLES}"
## [13] "{HERB MARKER PARSLEY}"               
## [14] "{CREAM FELT EASTER EGG BASKET}"      
## [15] "{WHITE HANGING HEART T-LIGHT HOLDER}"
## [16] "{GREEN REGENCY TEACUP AND SAUCER}"   
## [17] "{SET/6 RED SPOTTY PAPER PLATES}"     
## [18] "{HERB MARKER MINT}"                  
## [19] "{HERB MARKER ROSEMARY}"              
## [20] "{HERB MARKER THYME}"

# Create a HTML widget of the graph of rules
# method = "graph" draws items and rules as nodes with lhs -> rule -> rhs
# arrows; engine = "htmlwidget" makes the graph interactive in the browser
plot(rules_online, method = "graph", engine = "htmlwidget")
# HTML widget graph for the highest confidence rules
plot(head(sort(rules_online, by="confidence"), 5), method = "graph", engine = "htmlwidget")
# HTML widget graph for rules with lowest lift
# sort() is descending by default, so tail() picks the lowest-lift rules
plot(tail(sort(rules_online, by="lift"), 5), method = "graph", engine = "htmlwidget")
# Create an interactive graph visualization
rules_html <- plot(rules_online, method = "graph", engine = "htmlwidget")

# Save the interactive graph as an html file
# htmlwidgets::saveWidget(rules_html, file = "./RInputFiles/rules_grocery.html")


# Plot a group matrix-based visualization
# plot(subset_rules, method = "grouped")

# Change the arguments of group matrix-based visualization
# plot(subset_rules, method = "grouped", measure = "lift", shading = "confidence")


# Plotting the parallel coordinate plots
# paracoord: one axis per lhs position plus one for the rhs; each rule is a
# line whose arrow points toward the consequent
plot(rules_online, method = "paracoord")

# Parallel coordinate plots with confidence as color coding
plot(rules_online, method = "paracoord", shading = "confidence")


Chapter 4 - Case Study: Market Basket with Movies

Recap on Transactions:

  • Market basket analysis is based on which items are bought together rather than the quantity of each item purchased
  • Can use the dataset Groceries from the library arules for an example
  • Can use the crossTable(x, sort=TRUE) function to get the joint relationships of the various items
    • crossTable(x, measure=“chi”)
    • crossTable(x, measure=“lift”, sort=TRUE)
  • Can get the counts of various items by using x[‘item1’, ‘item2’]

Mining Association Rules:

  • Can use arules::apriori() to find key itemsets and combinations
    • Using target=“rules” will pull out the rules, which should be paired with sort(), head(), and tail()
  • Can use varying confidence levels and loop over them
  • Can use %ain% to mean ‘all items in’ or %in% to mean ‘any items in’

Visualizing Transactions and Rules:

  • Can create plots using the method= argument
  • Can use shiny by way of ruleExplorer(rules)
    • Not always recommended for large data as calculation times can be lengthy

Making the most of Market Basket Analysis:

  • Can use market basket analysis to cluster users
  • Example of trying to understand what purchased items are associated with a purchase of yogurt

Wrap Up:

  • Support, confidence, and lift are the main metrics for evaluating an association
  • Visualization and plotting are key skills
  • Recommendation engines can be based on market basket analysis
  • Can extend with information about transaction timing, location, etc., for greater insights
  • Be careful to sort and to look at only subsets of rules

Example code includes:

# -- Case study: build a transactions object from movie viewings -------------
# NOTE(review): movie_subset is loaded earlier in this document (outside this
# excerpt); each row is one (user, movie) viewing record.

# Have a glimpse at the dataset
movie_subset %>% 
    glimpse()
## Observations: 19,455
## Variables: 6
## $ X1      <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, ...
## $ userId  <dbl> 1323, 1323, 1323, 1323, 1323, 1323, 1323, 1323, 1323, 1323,...
## $ movieId <dbl> 1, 3, 5, 10, 11, 12, 15, 16, 17, 19, 21, 22, 23, 29, 31, 34...
## $ title   <chr> "Toy Story", "Grumpier Old Men", "Father of the Bride Part ...
## $ year    <dbl> 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995,...
## $ genres  <chr> "Adventure|Animation|Children|Comedy|Fantasy", "Comedy|Roma...
# Calculate the number of distinct users and movies
n_distinct(movie_subset$userId)
## [1] 100
n_distinct(movie_subset$movieId)
## [1] 4598
# Distribution of the number of movies watched by users
movie_subset %>%
    group_by(userId) %>% 
    summarize(nb_movies = n_distinct(movieId)) %>%
    ggplot(aes(x=nb_movies)) +
    geom_histogram() + 
    ggtitle("Distribution of number of movies watched")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Split dataset into movies and users
# split() yields a named list: one character vector of titles per userId —
# the "basket" for each user
data_list <- split(movie_subset$title, movie_subset$userId)

# Transform data into a transactional dataset
# Coerce the per-user lists into an arules transactions object; duplicate
# titles within a user are dropped (hence the warning)
movie_trx <- as(data_list, "transactions")
## Warning in asMethod(object): removing duplicated items in transactions
# Plot of the item matrix
# image() shows which of the first 100 items each of the 100 users watched
image(movie_trx[1:100,1:100])

# Setting the plot configuration option
# Stack the next two plots vertically in one device
par(mfrow=c(2, 1))

# Plot the relative and absolute item frequency plot
itemFrequencyPlot(movie_trx, type = "relative", topN = 10, horiz = TRUE, main = 'Relative item frequency')
itemFrequencyPlot(movie_trx, type = "absolute", topN = 10, horiz = TRUE, main = 'Absolute item frequency')

# Restore the default single-panel layout
par(mfrow=c(1, 1))


# Extract the set of most frequent itemsets
# target = 'frequent itemsets' (no confidence threshold); support 0.4 means
# an itemset must appear in at least 40 of the 100 users' baskets
itemsets <- apriori(movie_trx, parameter = list(support = 0.4, target = 'frequent itemsets'))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##          NA    0.1    1 none FALSE            TRUE       5     0.4      1
##  maxlen            target   ext
##      10 frequent itemsets FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [16 set(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Inspect the five most popular items
arules::inspect(sort(itemsets, by='support', decreasing = TRUE)[1:5])
##     items                       support count
## [1] {Matrix, The}               0.60    60   
## [2] {American Beauty}           0.57    57   
## [3] {Fight Club}                0.54    54   
## [4] {Silence of the Lambs, The} 0.50    50   
## [5] {Shawshank Redemption, The} 0.48    48
# Extract the set of most frequent itemsets
# minlen = 2 excludes single-item sets so only co-watched pairs (and larger
# sets) are reported; support is lowered to 0.3 to compensate
itemsets_minlen2 <- apriori(movie_trx, parameter = list(support = 0.3, minlen = 2, target = 'frequent'))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##          NA    0.1    1 none FALSE            TRUE       5     0.3      2
##  maxlen            target   ext
##      10 frequent itemsets FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 30 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [115 set(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Inspect the five most popular items
arules::inspect(sort(itemsets_minlen2, by='support', decreasing = TRUE)[1:5])
##     items                                                support count
## [1] {Matrix, The,                                                     
##      Silence of the Lambs, The}                             0.40    40
## [2] {Lord of the Rings: The Fellowship of the Ring, The,              
##      Lord of the Rings: The Two Towers, The}                0.38    38
## [3] {American Beauty,                                                 
##      Pulp Fiction}                                          0.38    38
## [4] {Pulp Fiction,                                                    
##      Silence of the Lambs, The}                             0.38    38
## [5] {Matrix, The,                                                     
##      Star Wars: Episode IV - A New Hope}                    0.38    38
# Grid of confidence levels to scan: 95% down to 50% in 5% steps
confidenceLevels <- seq(from = 0.95, to = 0.5, by = -0.05)

# Preallocate the result vectors (one rule count per confidence level);
# growing a NULL vector inside the loop would reallocate on every pass
rules_sup04 <- integer(length(confidenceLevels))
rules_sup03 <- integer(length(confidenceLevels))

# Count the rules apriori finds at 40% and 30% support for each
# confidence level (seq_along() is safe even for a zero-length grid)
for (i in seq_along(confidenceLevels)) {
  rules_sup04[i] <- length(
    apriori(movie_trx,
            parameter = list(support = 0.4,
                             confidence = confidenceLevels[i],
                             target = "rules"))
  )
  rules_sup03[i] <- length(
    apriori(movie_trx,
            parameter = list(support = 0.3,
                             confidence = confidenceLevels[i],
                             target = "rules"))
  )
}
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.95    0.1    1 none FALSE            TRUE       5     0.4      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.95    0.1    1 none FALSE            TRUE       5     0.3      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 30 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [10 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.9    0.1    1 none FALSE            TRUE       5     0.4      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.9    0.1    1 none FALSE            TRUE       5     0.3      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 30 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [26 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.85    0.1    1 none FALSE            TRUE       5     0.4      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.85    0.1    1 none FALSE            TRUE       5     0.3      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 30 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [52 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5     0.4      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.02s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [1 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5     0.3      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 30 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [90 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.75    0.1    1 none FALSE            TRUE       5     0.4      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [1 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.75    0.1    1 none FALSE            TRUE       5     0.3      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 30 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [129 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.7    0.1    1 none FALSE            TRUE       5     0.4      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [1 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.7    0.1    1 none FALSE            TRUE       5     0.3      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 30 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [162 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.65    0.1    1 none FALSE            TRUE       5     0.4      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.02s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [2 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.65    0.1    1 none FALSE            TRUE       5     0.3      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 30 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [194 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.6    0.1    1 none FALSE            TRUE       5     0.4      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [3 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.6    0.1    1 none FALSE            TRUE       5     0.3      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 30 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [220 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.55    0.1    1 none FALSE            TRUE       5     0.4      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [4 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.55    0.1    1 none FALSE            TRUE       5     0.3      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 30 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [238 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5     0.4      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 40 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [15 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [6 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5     0.3      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 30 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [254 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Number of rules found with a support level of 40%.
# qplot() is deprecated in current ggplot2 (>= 3.4.0); use the full
# ggplot() API for consistency with the chart built just below.
ggplot(data.frame(confidenceLevels, rules_sup04),
       aes(x = confidenceLevels, y = rules_sup04)) +
  geom_point() +
  geom_line() +
  labs(x = "Confidence level",
       y = "Number of rules found",
       title = "Apriori with a support level of 40%") +
  theme_bw()

# Collect the rule counts for both support levels into one data frame
nb_rules <- data.frame(rules_sup04, rules_sup03, confidenceLevels)

# Compare the number of rules found at 40% vs 30% support across the
# confidence grid; each support level gets its own line + point layer,
# with the colour aesthetic doubling as the legend label
ggplot(nb_rules, aes(x = confidenceLevels)) +
  geom_line(aes(y = rules_sup04, colour = "Support level of 40%")) +
  geom_point(aes(y = rules_sup04, colour = "Support level of 40%")) +
  geom_line(aes(y = rules_sup03, colour = "Support level of 30%")) +
  geom_point(aes(y = rules_sup03, colour = "Support level of 30%")) +
  theme_bw() +
  ylab("") +
  ggtitle("Number of extracted rules with apriori")

# Mine association rules: support >= 30%, confidence >= 90%, and at
# least two items per rule (minlen = 2 excludes empty-LHS rules)
rules_movies <- apriori(
  movie_trx,
  parameter = list(supp = 0.3, conf = 0.9, minlen = 2, target = "rules")
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.9    0.1    1 none FALSE            TRUE       5     0.3      2
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 30 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [26 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Summary of the extracted rules: rule count, length distribution,
# and the quality measures (support / confidence / lift / count)
summary(rules_movies)
## set of 26 rules
## 
## rule length distribution (lhs + rhs):sizes
##  2  3  4 
##  8 15  3 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   3.000   2.808   3.000   4.000 
## 
## summary of quality measures:
##     support         confidence          lift           count      
##  Min.   :0.3000   Min.   :0.9091   Min.   :1.515   Min.   :30.00  
##  1st Qu.:0.3100   1st Qu.:0.9216   1st Qu.:1.568   1st Qu.:31.00  
##  Median :0.3400   Median :0.9394   Median :2.191   Median :34.00  
##  Mean   :0.3315   Mean   :0.9536   Mean   :2.095   Mean   :33.15  
##  3rd Qu.:0.3500   3rd Qu.:1.0000   3rd Qu.:2.558   3rd Qu.:35.00  
##  Max.   :0.3800   Max.   :1.0000   Max.   :2.632   Max.   :38.00  
## 
## mining info:
##       data ntransactions support confidence
##  movie_trx           100     0.3        0.9
# Flag redundant rules, then keep only the non-redundant ones
keep <- !is.redundant(rules_movies)
rules_pruned <- rules_movies[keep]

# Inspect the non-redundant rules with the highest confidence
arules::inspect(head(sort(rules_pruned, by = "confidence")))
##     lhs                                                     rhs                                                  support confidence     lift count
## [1] {Lord of the Rings: The Two Towers, The}             => {Lord of the Rings: The Fellowship of the Ring, The}    0.38  1.0000000 2.222222    38
## [2] {Lord of the Rings: The Fellowship of the Ring, The,                                                                                          
##      Lord of the Rings: The Return of the King, The}     => {Lord of the Rings: The Two Towers, The}                0.35  1.0000000 2.631579    35
## [3] {Lord of the Rings: The Return of the King, The,                                                                                              
##      Matrix, The}                                        => {Lord of the Rings: The Two Towers, The}                0.31  1.0000000 2.631579    31
## [4] {Lord of the Rings: The Return of the King, The,                                                                                              
##      Matrix, The}                                        => {Lord of the Rings: The Fellowship of the Ring, The}    0.31  1.0000000 2.222222    31
## [5] {Lord of the Rings: The Return of the King, The}     => {Lord of the Rings: The Two Towers, The}                0.35  0.9722222 2.558480    35
## [6] {Lord of the Rings: The Return of the King, The}     => {Lord of the Rings: The Fellowship of the Ring, The}    0.35  0.9722222 2.160494    35
# Plot rules as a scatterplot (confidence vs lift, shaded by support)
plot(rules_movies, measure = c("confidence", "lift"), shading = "support", jitter = 1, engine = "html")
# Interactive matrix-based plot
plot(rules_movies, method = "matrix", shading = "confidence", engine = "html")
# Grouped matrix plot of rules
plot(rules_movies, method = "grouped", measure = "lift", shading = "confidence")

# Parallel coordinate plot with confidence as the colour coding
plot(rules_movies, method = "paracoord", shading = "confidence")

# Plot movie rules as an interactive graph
plot(rules_movies, method = "graph", engine = "htmlwidget")
# Retrieve the top 10 rules with highest confidence
# (use <- for assignment, not =, per R convention)
top10_rules_movies <- head(sort(rules_movies, by = "confidence"), 10)

# Plot the top 10 rules as an interactive graph
plot(top10_rules_movies, method = "graph", engine = "htmlwidget")
# Mine rules that conclude with Pulp Fiction on the right-hand side
# (any left-hand side; support >= 30%, confidence >= 50%)
pulpfiction_rules_rhs <- apriori(
  movie_trx,
  parameter  = list(supp = 0.3, conf = 0.5),
  appearance = list(default = "lhs", rhs = "Pulp Fiction")
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5     0.3      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 30 
## 
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [19 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Inspect the first few rules with Pulp Fiction on the RHS
pulpfiction_rules_rhs %>%
  head() %>%
  arules::inspect()
##     lhs                                                     rhs            support confidence     lift count
## [1] {Schindler's List}                                   => {Pulp Fiction}    0.30  0.6818182 1.450677    30
## [2] {Jurassic Park}                                      => {Pulp Fiction}    0.31  0.7209302 1.533894    31
## [3] {Seven (a.k.a. Se7en)}                               => {Pulp Fiction}    0.30  0.8108108 1.725129    30
## [4] {Lord of the Rings: The Fellowship of the Ring, The} => {Pulp Fiction}    0.31  0.6888889 1.465721    31
## [5] {Sixth Sense, The}                                   => {Pulp Fiction}    0.31  0.7045455 1.499033    31
## [6] {Forrest Gump}                                       => {Pulp Fiction}    0.33  0.7857143 1.671733    33
arules::inspect(head(sort(pulpfiction_rules_rhs, by="lift"), 10))
##      lhs                            rhs            support confidence     lift count
## [1]  {Fight Club,                                                                   
##       Silence of the Lambs, The} => {Pulp Fiction}    0.34  0.9189189 1.955147    34
## [2]  {American Beauty,                                                              
##       Silence of the Lambs, The} => {Pulp Fiction}    0.31  0.8857143 1.884498    31
## [3]  {Shawshank Redemption, The,                                                    
##       Silence of the Lambs, The} => {Pulp Fiction}    0.31  0.8857143 1.884498    31
## [4]  {Fight Club,                                                                   
##       Matrix, The}               => {Pulp Fiction}    0.30  0.8333333 1.773050    30
## [5]  {Seven (a.k.a. Se7en)}      => {Pulp Fiction}    0.30  0.8108108 1.725129    30
## [6]  {American Beauty,                                                              
##       Matrix, The}               => {Pulp Fiction}    0.30  0.8108108 1.725129    30
## [7]  {Matrix, The,                                                                  
##       Silence of the Lambs, The} => {Pulp Fiction}    0.32  0.8000000 1.702128    32
## [8]  {American Beauty,                                                              
##       Fight Club}                => {Pulp Fiction}    0.30  0.7894737 1.679731    30
## [9]  {Forrest Gump}              => {Pulp Fiction}    0.33  0.7857143 1.671733    33
## [10] {Silence of the Lambs, The} => {Pulp Fiction}    0.38  0.7600000 1.617021    38
# Mine rules with Pulp Fiction on the left-hand side
# (any right-hand side; support >= 30%, confidence >= 50%)
pulpfiction_rules_lhs <- apriori(
  movie_trx,
  parameter  = list(supp = 0.3, conf = 0.5),
  appearance = list(default = "rhs", lhs = "Pulp Fiction")
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5     0.3      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 30 
## 
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [16 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Summary of the rules with Pulp Fiction on the LHS.
# Note: without minlen = 2, apriori also returns empty-LHS rules
# ({} => item), which is why the length distribution below includes
# size-1 rules; the next extraction adds minlen = 2 to drop them.
summary(pulpfiction_rules_lhs)
## set of 16 rules
## 
## rule length distribution (lhs + rhs):sizes
##  1  2 
##  4 12 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.75    2.00    1.75    2.00    2.00 
## 
## summary of quality measures:
##     support         confidence          lift           count      
##  Min.   :0.3000   Min.   :0.5000   Min.   :1.000   Min.   :30.00  
##  1st Qu.:0.3100   1st Qu.:0.6287   1st Qu.:1.234   1st Qu.:31.00  
##  Median :0.3600   Median :0.6596   Median :1.454   Median :36.00  
##  Mean   :0.3887   Mean   :0.6714   Mean   :1.385   Mean   :38.88  
##  3rd Qu.:0.4100   3rd Qu.:0.7553   3rd Qu.:1.538   3rd Qu.:41.00  
##  Max.   :0.6000   Max.   :0.8085   Max.   :1.725   Max.   :60.00  
## 
## mining info:
##       data ntransactions support confidence
##  movie_trx           100     0.3        0.5
# Inspect the first few LHS rules (note the empty-LHS entries)
pulpfiction_rules_lhs %>%
  head() %>%
  arules::inspect()
##     lhs               rhs                         support confidence lift    
## [1] {}             => {American Beauty}           0.57    0.5700000  1.000000
## [2] {}             => {Silence of the Lambs, The} 0.50    0.5000000  1.000000
## [3] {}             => {Fight Club}                0.54    0.5400000  1.000000
## [4] {}             => {Matrix, The}               0.60    0.6000000  1.000000
## [5] {Pulp Fiction} => {Schindler's List}          0.30    0.6382979  1.450677
## [6] {Pulp Fiction} => {Jurassic Park}             0.31    0.6595745  1.533894
##     count
## [1] 57   
## [2] 50   
## [3] 54   
## [4] 60   
## [5] 30   
## [6] 31
# Re-extract the Pulp Fiction LHS rules, this time requiring at least
# two items per rule (minlen = 2) so the empty-LHS rules are dropped
pulpfiction_rules_lhs <- apriori(
  movie_trx,
  parameter  = list(supp = 0.3, conf = 0.5, minlen = 2),
  appearance = list(default = "rhs", lhs = "Pulp Fiction")
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5     0.3      2
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 30 
## 
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[4508 item(s), 100 transaction(s)] done [0.01s].
## sorting and recoding items ... [56 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [12 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
# Inspect the first few rules now that empty-LHS rules are excluded
pulpfiction_rules_lhs %>%
  head() %>%
  arules::inspect()
##     lhs               rhs                                                  support confidence     lift count
## [1] {Pulp Fiction} => {Schindler's List}                                      0.30  0.6382979 1.450677    30
## [2] {Pulp Fiction} => {Jurassic Park}                                         0.31  0.6595745 1.533894    31
## [3] {Pulp Fiction} => {Seven (a.k.a. Se7en)}                                  0.30  0.6382979 1.725129    30
## [4] {Pulp Fiction} => {Lord of the Rings: The Fellowship of the Ring, The}    0.31  0.6595745 1.465721    31
## [5] {Pulp Fiction} => {Sixth Sense, The}                                      0.31  0.6595745 1.499033    31
## [6] {Pulp Fiction} => {Forrest Gump}                                          0.33  0.7021277 1.671733    33

Analyzing Social Media Data in R

Chapter 1 - Understanding Twitter Data

Analyzing Twitter Data:

  • Social media analysis is the process of collecting and analyzing social media data to derive insights
  • Can use stream_tweets() to sample 1% of the data from 30 seconds of tweets to a data frame
    • Can use stream_tweets("", timeout=60) to extend to 60 seconds
  • There are many applications of using tweets - sentiments, events, outbreaks, etc.
  • The hashtags make it easy to follow topics on Twitter

Extracting Twitter Data:

  • API (Application Programming Interface) allows applications to talk with each other and exchange information
  • Different levels of Twitter API provide different levels of access intended for different types of users
    • Standard API - free and includes only the last 7 days of tweets
  • Prerequisites for running twitter analysis in R include
    • Twitter account
    • Pop-up blocker disabled
    • Interactive R session opened
    • rtweet and httpuv installed
  • The search_tweets() function will retrieve a maximum of 18,000 tweets from the past 7 days
    • The first argument is the search term
    • Can add n=, include_rts=, lang=‘en’, etc.
  • The get_timeline() function will return up to 3,200 tweets posted by a specific user
    • get_timeline(‘userName’, n=)

Components of Twitter Data:

  • A tweet can have over 150 metadata components, stored using JSON format
  • The rtweet library converts the Twitter JSON to a data.frame
    • Each attribute becomes a column
    • screen_name is the Twitter handle
  • Can use the lookup_users(users) to get the follower data
    • tvseries <- lookup_users(c(“GameOfThrones”, “fleabag”, “BreakingBad”))
    • user_df <- tvseries[, c(“screen_name”, “followers_count”)]
  • Can grab “retweet_text” column from the data.frame to get the retweet counts

Example code includes:

# Extract live tweets for 120 seconds window
# tweets120s <- rtweet::stream_tweets("", timeout = 120)

# View dimensions of the data frame with live tweets
# dim(tweets120s)


# Extract tweets on "#Emmyawards" and include retweets
# twts_emmy <- rtweet::search_tweets("#Emmyawards", n = 2000, include_rts = TRUE, lang = "en")

# View output for the first 5 columns and 10 rows
# head(twts_emmy[,1:5], 10)


# Extract tweets posted by the user @Cristiano
# get_cris <- rtweet::get_timeline("@Cristiano", n = 3200)

# View output for the first 5 columns and 10 rows
# head(get_cris[,1:5], 10)


# Load saved tweets on AI from disk and inspect the structure.
# NOTE(review): the file has an .xls extension but is read with read_csv() —
# presumably it is actually a CSV file with a misleading extension; confirm.
tweets_ai <- read_csv("./RInputFiles/tweets_ai.xls")
str(tweets_ai)


# Tabulate how many tweets each user posted on the topic.
sc_name <- tweets_ai$screen_name %>% table()

# Order the counts from the most to the least active user.
sc_name_sort <- sc_name %>% sort(decreasing = TRUE)

# Show the ten most active users.
head(sc_name_sort, 10)


# Extract user data for the twitter accounts of 4 news sites
# users <- rtweet::lookup_users("nytimes", "CNN", "FoxNews", "NBCNews")

# Create a subset data frame of screen names and follower counts
# user_df <- users[,c("screen_name","followers_count")]

# Display and compare the follower counts for the 4 news sites
# user_df


# Create a data frame of tweet text and retweet count
rtwt <- tweets_ai[,c("text", "retweet_count")]
head(rtwt)

# Sort data frame based on descending order of retweet counts
rtwt_sort <- arrange(rtwt, desc(retweet_count))

# Exclude rows with duplicate text from the sorted data frame.
# FIX: unique() for data frames has no `by =` argument — it was silently
# swallowed by `...`, so rows with the same text but different retweet
# counts were all kept. Deduplicate explicitly on the text column; because
# the data is already sorted, the highest-retweet copy of each text wins.
rtwt_unique <- rtwt_sort[!duplicated(rtwt_sort$text), ]

# Print top 6 unique posts retweeted most number of times
rownames(rtwt_unique) <- NULL
head(rtwt_unique)

Chapter 2 - Analyzing Twitter Data

Filtering Tweets:

  • Filtering is necessary due to the very large volume of tweets
  • Can use the -filter to extract original tweets
    • search_tweets(“digital_marketing”, n=100)
    • search_tweets(“digital_marketing -filter:retweets -filter:quote -filter:replies”, n=100)
  • Can extract only a minimum number of retweets and favorites
    • search_tweets(“bitcoin min_faves:100 AND min_retweets:100”)

Twitter User Analysis:

  • Can use Twitter to identify influencers and other key individuals for marketing
    • Followers are users following a specific user - followers_count
    • Friends are people the specific user is following - friends_count
    • The golden ratio is defined as followers divided by friends
  • Can use the lists_users() command to get Twitter lists and lists_subscribers() to get the associated subscribers
    • lists_users(“myUser”)
    • lists_subscribers(slug=“gaming”, owner_user=“myList”, n=100)
    • lookup_users(users)

Twitter Trends:

  • Trending topics can provide a roadmap for areas for increasing engagement
    • trend_topics <- get_trends()
    • trends_available() # list of cities and countries
    • trend_topics <- get_trends(“United States”)
    • trend_topics <- get_trends(“New York”)
  • The data for the ‘tweet_volume’ column is available only for some trends

Plotting Twitter Data Over Time:

  • Can create time series data from the Twitter data
    • ts_plot(df, by=“hours”, col=“blue”) # by= is the unit of time to use for creating the plot - seconds, minutes, hours, days, etc.
    • ts_data(df, by=“minutes”)

Example code includes:

# DO NOT RUN

# Extract 100 original tweets on "Superbowl"
# (the -filter: search operators strip retweets, quote tweets, and replies)
tweets_org <- search_tweets("Superbowl -filter:retweets -filter:quote -filter:replies", n = 100)

# Check for presence of replies
# NOTE(review): with tidyverse attached, count() is dplyr::count(), which
# expects a data frame, not a vector — this likely errors; presumably
# plyr::count() or base table() was intended. Confirm against course setup.
count(tweets_org$reply_to_screen_name)

# Check for presence of quotes
count(tweets_org$is_quote)

# Check for presence of retweets
count(tweets_org$is_retweet)


# Extract tweets on "Apple iphone" in French
# (lang = "fr" restricts results to tweets Twitter has tagged as French)
tweets_french <- search_tweets("Apple iphone", lang = "fr")

# View the tweets
head(tweets_french$text)

# View the tweet metadata showing the language
head(tweets_french$lang)


# Extract tweets with a minimum of 100 retweets and 100 favorites
# (min_retweets:/min_faves: are Twitter search operators, not rtweet args)
tweets_pop <- search_tweets("Chelsea min_retweets:100 AND min_faves:100")

# Create a data frame to check retweet and favorite counts
counts <- tweets_pop[c("retweet_count", "favorite_count")]
head(counts)

# View the tweets
head(tweets_pop$text)


# Extract user information of people who have tweeted on the topic
# (tweet_cos is assumed to be a prior search_tweets() result — not shown here)
user_cos <- users_data(tweet_cos)

# View few rows of user data
head(user_cos)

# Aggregate screen name, follower and friend counts
# (mean() collapses duplicate rows for users who tweeted more than once)
counts_df <- user_cos %>%
    group_by(screen_name) %>%
    summarize(follower = mean(followers_count), friend = mean(friends_count))

# View the output
head(counts_df)


# Golden ratio: followers per friend for each account.
counts_df$ratio <- with(counts_df, follower / friend)

# Order accounts from most to least followed.
counts_sort <- counts_df %>% arrange(desc(follower))

# Inspect the most-followed accounts.
head(counts_sort)

# Accounts with more than 50,000 followers.
counts_sort[counts_sort$follower > 50000, ]

# Accounts with fewer than 1,000 followers.
counts_sort[counts_sort$follower < 1000, ]


# Get topics trending in Canada
# (get_trends() accepts a place name listed by trends_available())
gt_country <- get_trends("Canada")

# View the first 6 columns
head(gt_country[,1:6])


# Get topics trending in London
gt_city <- get_trends("London")

# View the first 6 columns
head(gt_city[,1:6])

# Aggregate the trends and tweet volumes.
# FIX: tweet_volume is populated only for some trends (noted above), so a
# bare mean() yields NA for trends with missing volume data; na.rm = TRUE
# keeps those rows from poisoning the aggregate.
trend_df <- gt_city %>%
    group_by(trend) %>%
    summarize(tweet_vol = mean(tweet_volume, na.rm = TRUE))

# Sort data frame on descending order of tweet volumes and print header
trend_df_sort <- arrange(trend_df, desc(tweet_vol))
head(trend_df_sort,10)


# Extract tweets on #walmart and exclude retweets
# (n = 18000 is the per-request maximum for search_tweets())
walmart_twts <- search_tweets("#walmart", n = 18000, include_rts = FALSE)

# View the output
head(walmart_twts)

# Create a time series plot
# (ts_plot() aggregates tweet counts per hour and draws a line chart)
ts_plot(walmart_twts, by = "hours", color = "blue")


# Create a time series object for Puma at hourly intervals
# (puma_st is assumed to be a prior search_tweets() result — not shown here)
puma_ts <- ts_data(puma_st, by ='hours')

# Rename the two columns in the time series object
# (ts_data() returns a two-column frame: timestamp and tweet count)
names(puma_ts) <- c("time", "puma_n")

# View the output
head(puma_ts)

# Create a time series object for Nike at hourly intervals
# (nike_st is assumed to be a prior search_tweets() result — not shown here)
nike_ts <- ts_data(nike_st, by ='hours')

# Rename the two columns in the time series object
names(nike_ts) <- c("time", "nike_n")

# View the output
head(nike_ts)


# Merge the two time series objects and retain "time" column
# (all = TRUE is a full outer join: hours present in only one series are kept)
merged_df <- merge(puma_ts, nike_ts, by = "time", all = TRUE)
head(merged_df)

# Stack the tweet frequency columns into long format for ggplot.
# NOTE(review): melt() comes from reshape2/data.table, neither of which is
# attached above — confirm the intended package is loaded.
melt_df <- melt(merged_df, na.rm = TRUE, id.vars = "time")

# View the output
head(melt_df)

# Plot frequency of tweets on Puma and Nike
ggplot(data = melt_df, aes(x = time, y = value, col = variable))+
    geom_line(lwd = 0.8)

Chapter 3 - Visualize Tweet Texts

Processing Twitter Text:

  • Processing tweet text helps derive insights from the stream
  • Steps in text processing include
    • Remove redundant information
    • Convert text to corpus (list of text documents)
    • Convert the corpus to be all-lowercase
    • Remove stopwords from the corpus
  • Example of processing fields in twt_txt, a character vector of tweet texts
    • twt_txt_url <- qdapRegex::rm_twitter_url(twt_txt)
    • twt_txt_chrs <- gsub(“[^A-Za-z]”, " ", twt_txt_url) # leaves only upper and lower characters - removes numbers, punctuation, and special characters
    • twt_corpus <- twt_txt_chrs %>% VectorSource() %>% Corpus()
    • twt_corpus_lower <- tm_map(twt_corpus, tolower)
    • twt_corpus_stopwd <- tm_map(twt_corpus_lower, removeWords, stopwords(“english”))
    • twt_corpus_final <- tm_map(twt_corpus_stopwd, stripWhitespace)

Visualize Popular Terms:

  • Can extract the most common words (after pre-processing) and then plot them
    • freq_terms(corpus, n)
  • Can create custom stop words and remove them
    • custom_stop <- c()
    • tm_map(corpus, removeWords, custom_stop)
  • Can create word clouds to visualize frequent terms
    • wordcloud(corpus, min.freq=, colors=, scale=, random.order=FALSE)

Topic Modeling of Tweets:

  • Topic modeling is the process of automatically discovering topics from a corpus
  • Can use LDA (mathematical model for assessing mixtures of topics with words and mixture of documents with topics)
  • The document-term-matrix (DTM) is a mix of documents and terms
    • dtm <- DocumentTermMatrix(corpus)
    • inspect(dtm)
    • rowTotals <- apply(dtm, 1, sum)
    • tweet_dtm_new <- dtm[rowTotals > 0, ] # keep only documents with at least one term
  • Can then use topicmodels::LDA() to extract the topics
    • library(topicmodels)
    • lda_5 <- LDA(tweet_dtm_new, k=5)
    • top10_terms <- terms(lda_5, 10)

Twitter Sentiment Analysis:

  • Sentiment analysis is the process of understanding perceptions (positive, neutral, negative, joy, anger, etc.) from a text
  • The syuzhet package contains sentiment mapping functions and files

Example code includes:

# Load saved tweets on telemedicine and check the dimensions.
twt_telmed <- readRDS("./RInputFiles/tweets_telmed.rds")
dim(twt_telmed)


# Extract tweet text from the pre-loaded dataset
twt_txt <- twt_telmed$text
head(twt_txt)

# Remove URLs from the tweet text and view the output
twt_txt_url <- qdapRegex::rm_twitter_url(twt_txt)
head(twt_txt_url)

# Replace special characters, punctuation, & numbers with spaces
# (the negated class [^A-Za-z] keeps only upper/lowercase letters)
twt_txt_chrs  <- gsub("[^A-Za-z]"," " , twt_txt_url)

# View text after replacing special characters, punctuation, & numbers
head(twt_txt_chrs)


# Convert text in "twt_gsub" dataset to a text corpus and view output
# (a corpus is the list-of-documents structure the tm package operates on)
twt_corpus <- twt_txt_chrs %>% 
    tm::VectorSource() %>% 
    tm::Corpus()

head(twt_corpus$content)

# Convert the corpus to lowercase
# (tm_map() applies the given transformation to every document)
twt_corpus_lwr <- tm::tm_map(twt_corpus, tolower) 

# View the corpus after converting to lowercase
head(twt_corpus_lwr$content)


# Remove English stop words from the corpus and view the corpus
twt_corpus_stpwd <- tm::tm_map(twt_corpus_lwr, tm::removeWords, tm::stopwords("english"))
head(twt_corpus_stpwd$content)

# Remove additional spaces from the corpus
# (removing stop words leaves gaps; stripWhitespace collapses them)
twt_corpus_final <- tm::tm_map(twt_corpus_stpwd, tm::stripWhitespace)

# View the text corpus after removing spaces
head(twt_corpus_final$content)


# Extract term frequencies for top 60 words and view output
termfreq  <-  qdap::freq_terms(twt_corpus_final, 60)
termfreq

# Create a vector of custom stop words
# (domain/noise words, chosen by inspecting the frequency list above)
custom_stopwds <- c("telemedicine", " s", "amp", "can", "new", "medical", "will", "via", "way",  "today", "come", "t", "ways", "say", "ai", "get", "now")

# Remove custom stop words and create a refined corpus
corp_refined <- tm::tm_map(twt_corpus_final, tm::removeWords, custom_stopwds) 

# Extract term frequencies for the top 20 words
termfreq_clean <- qdap::freq_terms(corp_refined, 20)
termfreq_clean


# Extract term frequencies for the top 10 words
termfreq_10w <- qdap::freq_terms(corp_refined, 10)
termfreq_10w

# Identify terms with more than 60 counts from the top 10 list
term60 <- subset(termfreq_10w, FREQ > 60)

# Create a bar plot using terms with more than 60 counts
# (reorder(WORD, -FREQ) sorts the bars from most to least frequent)
ggplot(term60, aes(x = reorder(WORD, -FREQ), y = FREQ)) + 
    geom_bar(stat = "identity", fill = "red") + 
    theme(axis.text.x = element_text(angle = 15, hjust = 1))

# Extract term frequencies for the top 25 words
termfreq_25w <- qdap::freq_terms(corp_refined, 25)
termfreq_25w

# Identify terms with more than 50 counts from the top 25 list
term50 <- subset(termfreq_25w, FREQ > 50)
term50

# Create a bar plot using terms with more than 50 counts
ggplot(term50, aes(x = reorder(WORD, -FREQ), y = FREQ)) + 
    geom_bar(stat = "identity", fill = "blue") + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1))


# Create a word cloud in red with min frequency of 20
# (scale = c(max, min) sets font sizes; random.order = FALSE centers
# the most frequent terms)
wordcloud::wordcloud(corp_refined, min.freq = 20, colors = "red", scale = c(3, 0.5),random.order = FALSE)

# Create word cloud with 6 colors and max 50 words
wordcloud::wordcloud(corp_refined, max.words = 50, colors = RColorBrewer::brewer.pal(6, "Dark2"), 
                     scale=c(4, 1), random.order = FALSE
                     )


# Create a document term matrix (DTM) from the pre-loaded corpus
# dtm_climate <- tm::DocumentTermMatrix(corpus_climate)
# dtm_climate

# Find the sum of word counts in each document
# rowTotals <- apply(dtm_climate, 1, FUN=sum)
# head(rowTotals)

# Select rows with a row total greater than zero
# dtm_climate_new <- dtm_climate[rowTotals > 0, ]
# dtm_climate_new


# Create a topic model with 5 topics
# topicmodl_5 <- topicmodels::LDA(dtm_climate_new, k = 5)

# Select and view the top 10 terms in the topic model
# top_10terms <- terms(topicmodl_5, 10)
# top_10terms 

# Create a topic model with 4 topics
# topicmodl_4 <- topicmodels::LDA(dtm_climate_new, k = 4)

# Select and view the top 6 terms in the topic model
# top_6terms <- terms(topicmodl_4, 6)
# top_6terms 


# Perform sentiment analysis for tweets on `Climate change` 
# sa.value <- syuzhet::get_nrc_sentiment(tweets_cc$text)

# View the sentiment scores
# head(sa.value, 10)


# Calculate sum of sentiment scores
# score <- colSums(sa.value[,])

# Convert the sum of scores to a data frame
# score_df <- data.frame(score)

# Convert row names into 'sentiment' column and combine with sentiment scores
# score_df2 <- cbind(sentiment = row.names(score_df), score_df, row.names = NULL)
# print(score_df2)

# Plot the sentiment scores
# ggplot(data = score_df2, aes(x = sentiment, y = score, fill = sentiment)) +
#     geom_bar(stat = "identity") +
#     theme(axis.text.x = element_text(angle = 45, hjust = 1))

Chapter 4 - Network Analysis and Mapping

Twitter Network Analysis:

  • Network analysis is the process of mapping network objects to understand interdependencies and information flow
    • Nodes/vertices are the objects
    • Edges are the connections between the objects - can be directed or undirected
  • Example for the retweet network of #OOTD
    • twts_OOTD <- search_tweets(“#OOTD”, n=18000, include_rts=TRUE)
    • rt_df <- twts_OOTD[, c(“screen_name”, “retweet_screen_name”)]
    • rt_df_new <- rt_df[complete.cases(rt_df), ]
    • matrx <- as.matrix(rt_df_new)
    • nw_rtweet <- igraph::graph_from_edgelist(el=matrx, directed=TRUE)

Network Centrality Measures:

  • Centrality measures include degree centrality and betweenness
  • Degree centrality is the number of edges for a vertex
    • Out-degree is the number of outward edges (e.g., number of times user retweets)
    • In-degree is the number of inward edges (e.g., number of times user is retweeted)
    • out_deg <- degree(nw_rtweet, “userName”, mode=c(“out”)) # can skip the “userName” and get for all users
    • in_deg <- degree(nw_rtweet, “userName”, mode=c(“in”)) # can skip “userName” and get for all users
  • Betweenness is the degree to which nodes link each other (high means more control over the network)
    • between_nw <- betweenness(nw_rtweet, directed=TRUE)

Visualizing Twitter Networks:

  • Can visualize plots using plot.igraph() with optional arguments
    • asp= is the aspect ratio (9/16 is rectangle)
    • vertex.size
    • vertex.color
    • edge.arrow.size
    • edge.color
    • vertex.label.cex
    • vertex.label.color

Mapping Twitter Data:

  • Can use geographic metadata from tweets for geo-coding and mapping
    • Place is selected by the user from a pre-defined list on Twitter and consist of a bounding box
    • Precise location is the specific lat/lon from GPS enabled devices (only around 1% of tweets have this level of geo-tagging)
  • Can use the rtweet library to find the geographic coding
    • pol_coord <- lat_lng(pol) # pol is tweet data
    • map(database=“state”, fill=TRUE, col=“light yellow”)
    • with(pol_geo, points(lng, lat, pch=20, cex=1, col=“blue”))

Wrap Up:

  • Tweet components and extraction
  • Filtering and analyzing tweets
  • Word clouds, sentiment analysis, and visualization
  • Network analysis and geocoding/mapping

Example code includes:

# DO NOT RUN - DO NOT HAVE DATASET

# Extract source vertex and target vertex from the tweet data frame
# (edge direction: retweeter -> original author)
rtwt_df <- twts_trvl[, c("screen_name" , "retweet_screen_name")]

# View the data frame
head(rtwt_df)

# Remove rows with missing values
# (non-retweets have NA retweet_screen_name and cannot form an edge)
rtwt_df_new <- rtwt_df[complete.cases(rtwt_df), ]

# Create a matrix
# (graph_from_edgelist() requires a two-column character matrix)
rtwt_matrx <- as.matrix(rtwt_df_new)
head(rtwt_matrx)


# Convert the matrix to a retweet network
# (directed = TRUE preserves who retweeted whom)
nw_rtweet <- graph_from_edgelist(el = rtwt_matrx, directed = TRUE)

# View the retweet network
print.igraph(nw_rtweet)


# Out-degree: how many times each user retweeted others.
out_degree <- degree(nw_rtweet, mode = "out")

# Rank users from most to least active retweeters.
out_degree_sort <- out_degree %>% sort(decreasing = TRUE)

# Top 10 retweeters by out-degree.
out_degree_sort[1:10]


# Compute the in-degree scores from the retweet network
# (in-degree = number of times a user was retweeted)
in_degree <- degree(nw_rtweet, mode = c("in"))

# Sort the in-degree scores in decreasing order
in_degree_sort <- sort(in_degree, decreasing = TRUE)

# View users with the top 10 in-degree scores
in_degree_sort[1:10]


# Calculate the betweenness scores from the retweet network
# (high betweenness = user bridges otherwise-separate parts of the network)
betwn_nw <- betweenness(nw_rtweet, directed = TRUE)

# Sort betweenness scores in decreasing order and round the values
betwn_nw_sort <- betwn_nw %>%
    sort(decreasing = TRUE) %>%
    round()

# View users with the top 10 betweenness scores 
betwn_nw_sort[1:10]


# Create a basic network plot
plot.igraph(nw_rtweet)

# Create a network plot with formatting attributes
# (set.seed() fixes the random graph layout so the plot is reproducible)
set.seed(1234)
plot(nw_rtweet, asp = 9/12, vertex.size = 10, vertex.color = "green", edge.arrow.size = 0.5, 
     edge.color = "black", vertex.label.cex = 0.9, vertex.label.color = "black"
     )


# Create a variable for out-degree
deg_out <- degree(nw_rtweet, mode = c("out"))
deg_out

# Amplify the out-degree values
# (linear rescale so zero-degree vertices still get a visible size of 5)
vert_size <- (deg_out * 3) + 5

# Set vertex size to amplified out-degree values
set.seed(1234)
plot(nw_rtweet, asp = 10/11, vertex.size = vert_size, vertex.color = "lightblue", 
     edge.arrow.size = 0.5, edge.color = "grey", vertex.label.cex = 0.8, vertex.label.color = "black"
     )


# Create a column and categorize follower counts above and below 500
followers$follow <- ifelse(followers$followers_count > 500, "1", "0")
head(followers)

# Assign the new column as vertex attribute to the retweet network
# NOTE(review): this assumes rows of `followers` align one-to-one with the
# vertex order of nw_rtweet — confirm the ordering against the caller.
V(nw_rtweet)$followers <- followers$follow
vertex_attr(nw_rtweet)

# Set the vertex colors based on follower count and create a plot
# (as.factor maps "0"/"1" to indices 1/2 into sub_color)
sub_color <- c("lightgreen", "tomato")
plot(nw_rtweet, asp = 9/12, vertex.size = vert_size, edge.arrow.size = 0.5, vertex.label.cex = 0.8,
     vertex.color = sub_color[as.factor(vertex_attr(nw_rtweet, "followers"))], 
     vertex.label.color = "black", vertex.frame.color = "grey"
     )


# Extract tweets using search_tweets()
vegan <- search_tweets("#vegan", n = 18000)

# Extract geo-coordinates data to append as new columns
# (lat_lng() adds lat/lng columns derived from tweet geo metadata)
vegan_coord <- lat_lng(vegan)

# View the columns with geo-coordinates for first 20 tweets
# (most will be NA — only ~1% of tweets carry precise coordinates)
head(vegan_coord[c("lat", "lng")], 20)


# Omit rows with missing geo-coordinates in the data frame
vegan_geo <- na.omit(vegan_coord[,c("lat", "lng")])

# View the output
head(vegan_geo)

# Plot longitude and latitude values of tweets on the US state map
# NOTE(review): map() is from the maps package, which is not attached
# above — confirm it is loaded where this runs.
map(database = "state", fill = TRUE, col = "light yellow")
with(vegan_geo, points(lng, lat, pch = 20, cex = 1, col = 'blue'))

# Plot longitude and latitude values of tweets on the world map
map(database = "world", fill = TRUE, col = "light yellow")
with(vegan_geo, points(lng, lat, pch = 20, cex = 1, col = 'blue')) 

Building Web Applications with Shiny in R

Chapter 1 - Get Started with Shiny

Introduction to Shiny:

  • Shiny is an R package that allows for creating interactive applications and graphics
  • Web applications have a user interface (UI) that updates the display based on an app in the server

Build a “Hello World” Shiny App:

  • Shiny can be loaded like any other R package - library(shiny)
  • Example shell for a Shiny app
    • library(shiny)
    • ui <- fluidPage()
    • server <- function(input, output, session) {}
    • shinyApp(ui=ui, server=server)

Build a babynames explorer Shiny App:

  • Begin by sketching out the desired look and usability of the app
    • Add UI (inputs and interface)
    • Add Server
  • The user interface can have many components
    • titlePanel()
    • textInput()
    • textOutput() # use the quoted variable name from server as the argument
    • plotOutput() # use the quoted variable name from server as the argument
    • sidebarPanel()
    • mainPanel()
    • sidebarLayout()

Example code includes:

library(shiny)


# Minimal Shiny app: a single text input with id "name" and an empty server.
ui <- fluidPage(
  textInput("name", "Enter your name: ")
)

server <- function(input, output) {
  # No reactive logic yet.
}

shinyApp(ui = ui, server = server)


# Greeting app: text input "name" feeds the text output "greeting".
ui <- fluidPage(
  textInput("name", "What is your name?"),
  textOutput("greeting")
)

server <- function(input, output) {
  # renderText() re-runs whenever input$name changes.
  output$greeting <- renderText({
    paste0("Hello, ", input$name)
  })
}

shinyApp(ui = ui, server = server)


# Text input with a default value ("David"); server intentionally empty.
ui <- fluidPage(
  textInput("name", "Enter Your Name", "David")
)

server <- function(input, output, session) {
  # Nothing reactive yet.
}

shinyApp(ui = ui, server = server)


# App with a plot output wired to the server (renders a blank canvas).
ui <- fluidPage(
  textInput('name', 'Enter Name', 'David'),
  plotOutput("trend")
)

server <- function(input, output, session) {
  # An empty ggplot() produces a blank plot placeholder.
  output$trend <- renderPlot({
    ggplot()
  })
}

shinyApp(ui = ui, server = server)


# Baby Name Explorer: sidebar holds the name input, main panel the trend plot.
ui <- fluidPage(
  titlePanel("Baby Name Explorer"),
  sidebarLayout(
    sidebarPanel(textInput('name', 'Enter Name', 'David')),
    mainPanel(plotOutput('trend'))
  )
)

server <- function(input, output, session) {
  # Line plot of the selected name's popularity over time, colored by sex.
  output$trend <- renderPlot({
    selected <- filter(babynames::babynames, name == input$name)
    ggplot(selected, aes(x = year, y = prop, color = sex)) +
      geom_line()
  })
}

shinyApp(ui = ui, server = server)

Chapter 2 - Inputs, Outputs, and Layouts

Inputs:

  • Can create many types of inputs - text, slider, select, numerical, daterange, etc.
    • selectInput(“name”, “make selection”, choices=c(“a”, “b”, “c”))
    • sliderInput(“name”, “make selection”, value=1925, min=1900, max=2000)
  • Need to give every input a unique name so that it can be called in the server function

Outputs:

  • The render functions build outputs as functions of inputs and other factors
    • renderText({})
    • renderTable()
    • renderImage()
    • renderPlot()
  • Can then show outputs in the UI by using the appropriate textOutput() function
  • Can use html widgets and associated packages to render Shiny outputs - example for data.table in package DT

Layouts and Themes:

  • The default Shiny App layout can be customized as needed
  • The sidebarLayout() is a common over-ride, where there is a side panel and a main panel
  • A tabbed layout allows for multiple tabs, and is typically placed inside a sidebarLayout
    • There must be a tabsetPanel() inside, with each containing one or more tabPanel()
  • Can allow users to select themes using shinythemes::themeSelector()
    • theme=shinythemes::shinytheme(‘superhero’) # if inside the UI, will apply theme ‘superhero’

Building Apps:

  • Example of creating an app based on gapminder data with selections for continent and year
    • Add inputs (UI)
    • Add outputs (UI/Server)
    • Update layout (UI)
    • Connect Server and UI with appropriate functions

Example code includes:

# Top-10 names app: sex and year inputs drive a bar plot of name popularity.
ui <- fluidPage(
  titlePanel("What's in a Name?"),
  # CODE BELOW: Add select input named "sex" to choose between "M" and "F"
  selectInput("sex", "Select Sex", selected="F", choices=c("F", "M")), 
  sliderInput("year", "Select Year", value=1900, min=1900, max=2010), 
  # Add plot output to display top 10 most popular names
  plotOutput('plot_top_10_names')
)

server <- function(input, output, session){
  # Render plot of top 10 most popular names
  output$plot_top_10_names <- renderPlot({
    # Get top 10 names by sex and year
    top_10_names <- babynames::babynames %>% 
      # MODIFY CODE BELOW: Filter for the selected sex
      filter(sex == input$sex) %>% 
      filter(year == input$year) %>% 
      top_n(10, prop)
    # Plot top 10 names by sex and year
    ggplot(top_10_names, aes(x = name, y = prop)) +
      geom_col(fill = "#263e63")
  })
}

shinyApp(ui = ui, server = server)


# Top-10 names app rendered as a table instead of a plot.
ui <- fluidPage(
  titlePanel("What's in a Name?"),
  # Add select input named "sex" to choose between "M" and "F"
  selectInput('sex', 'Select Sex', choices = c("F", "M")),
  # Add slider input named "year" to select year between 1900 and 2010
  sliderInput('year', 'Select Year', min = 1900, max = 2010, value = 1900),
  # CODE BELOW: Add table output named "table_top_10_names"
  tableOutput("table_top_10_names")
)

server <- function(input, output, session){
  # Function to create a data frame of top 10 names by sex and year 
  # (reads the reactive inputs, so it must be called inside a render function)
  top_10_names <- function(){
    top_10_names <- babynames::babynames %>% 
      filter(sex == input$sex) %>% 
      filter(year == input$year) %>% 
      top_n(10, prop)
  }
  # CODE BELOW: Render a table output named "table_top_10_names"
  output$table_top_10_names <- renderTable({top_10_names()})
}

shinyApp(ui = ui, server = server)


# Same top-10 app using the DT htmlwidget for an interactive table.
ui <- fluidPage(
  titlePanel("What's in a Name?"),
  # Add select input named "sex" to choose between "M" and "F"
  selectInput('sex', 'Select Sex', choices = c("M", "F")),
  # Add slider input named "year" to select year between 1900 and 2010
  sliderInput('year', 'Select Year', min = 1900, max = 2010, value = 1900),
  # MODIFY CODE BELOW: Add a DT output named "table_top_10_names"
  DT::DTOutput('table_top_10_names')
)
server <- function(input, output, session){
  # Helper returning the top 10 names for the selected sex and year.
  top_10_names <- function(){
    babynames::babynames %>% 
      filter(sex == input$sex) %>% 
      filter(year == input$year) %>% 
      top_n(10, prop)
  }
  # MODIFY CODE BELOW: Render a DT output named "table_top_10_names"
  output$table_top_10_names <- DT::renderDT({
    DT::datatable(top_10_names())
  })
}
shinyApp(ui = ui, server = server)


# Lookup table of "trendy" names used by the select inputs below.
top_trendy_names <- data.frame(name=c('Kizzy', 'Deneen', 'Royalty', 'Mareli', 'Moesha', 'Marely', 'Kanye', 'Tennille', 'Aitana', 'Kadijah', 'Shaquille', 'Catina', 'Allisson', 'Emberly', 'Nakia', 'Jaslene', 'Kyrie', 'Akeelah', 'Zayn', 'Talan'), stringsAsFactors=FALSE)

ui <- fluidPage(
  selectInput('name', 'Select Name', top_trendy_names$name),
  # CODE BELOW: Add a plotly output named 'plot_trendy_names'
  plotly::plotlyOutput("plot_trendy_names")
)

server <- function(input, output, session){
  # Function to plot trends in a name
  # (returns a ggplot; renderPlotly converts it to an interactive widget)
  plot_trends <- function(){
     babynames::babynames %>% 
      filter(name == input$name) %>% 
      ggplot(aes(x = year, y = n)) +
      geom_col()
  }
  # CODE BELOW: Render a plotly output named 'plot_trendy_names'
  output$plot_trendy_names <- plotly::renderPlotly({plot_trends()})
}

shinyApp(ui = ui, server = server)


# Trendy-names app with a sidebar layout: input on the left, plot and
# table stacked in the main panel.
ui <- fluidPage(
    # MODIFY CODE BELOW: Wrap in a sidebarLayout
    sidebarLayout(
        # MODIFY CODE BELOW: Wrap in a sidebarPanel
        sidebarPanel(selectInput('name', 'Select Name', top_trendy_names$name)),
        # MODIFY CODE BELOW: Wrap in a mainPanel
        mainPanel(plotly::plotlyOutput('plot_trendy_names'), DT::DTOutput('table_trendy_names'))
    )
)

# DO NOT MODIFY
server <- function(input, output, session){
  # Function to plot trends in a name
  plot_trends <- function(){
     babynames::babynames %>% 
      filter(name == input$name) %>% 
      ggplot(aes(x = year, y = n)) +
      geom_col()
  }
  output$plot_trendy_names <- plotly::renderPlotly({
    plot_trends()
  })
  
  output$table_trendy_names <- DT::renderDT({
    babynames::babynames %>% 
      filter(name == input$name)
  })
}

shinyApp(ui = ui, server = server)


# Trendy-names app with the plot and table split across tabs.
ui <- fluidPage(
    sidebarLayout(
        sidebarPanel(selectInput('name', 'Select Name', top_trendy_names$name)),
        mainPanel(
            # MODIFY CODE BLOCK BELOW: Wrap in a tabsetPanel
            tabsetPanel(
                # MODIFY CODE BELOW: Wrap in a tabPanel providing an appropriate label
                tabPanel("Plot", plotly::plotlyOutput('plot_trendy_names')),
                # MODIFY CODE BELOW: Wrap in a tabPanel providing an appropriate label
                tabPanel("Table", DT::DTOutput('table_trendy_names'))
            )
        )
    )
)

server <- function(input, output, session){
  # Function to plot trends in a name
  plot_trends <- function(){
     babynames::babynames %>% 
      filter(name == input$name) %>% 
      ggplot(aes(x = year, y = n)) +
      geom_col()
  }
  output$plot_trendy_names <- plotly::renderPlotly({
    plot_trends()
  })
  
  output$table_trendy_names <- DT::renderDT({
    babynames::babynames %>% 
      filter(name == input$name)
  })
}

shinyApp(ui = ui, server = server)


# Trendy-names app with a title and the shinythemes "spacelab" theme applied.
ui <- fluidPage(
  # CODE BELOW: Add a titlePanel with an appropriate title
  titlePanel("Trendy Names"), 
  # REPLACE CODE BELOW: with theme = shinythemes::shinytheme("<your theme>")
  theme = shinythemes::shinytheme("spacelab"),
  sidebarLayout(
    sidebarPanel(
      selectInput('name', 'Select Name', top_trendy_names$name)
    ),
    mainPanel(
      tabsetPanel(
        tabPanel('Plot', plotly::plotlyOutput('plot_trendy_names')),
        tabPanel('Table', DT::DTOutput('table_trendy_names'))
      )
    )
  )
)
server <- function(input, output, session){
  # Function to plot trends in a name
  plot_trends <- function(){
     babynames::babynames %>% 
      filter(name == input$name) %>% 
      ggplot(aes(x = year, y = n)) +
      geom_col()
  }
  output$plot_trendy_names <- plotly::renderPlotly({
    plot_trends()
  })
  
  output$table_trendy_names <- DT::renderDT({
    babynames::babynames %>% 
      filter(name == input$name)
  })
}
shinyApp(ui = ui, server = server)


# Greeting app: the greeting word and name are combined into one message.
# FIX: the input id "greeting" and the output id "greeting" collided —
# Shiny requires input and output IDs to be unique; the output is renamed.
ui <- fluidPage(
    selectInput("greeting", "Select Greeting", selected="Hello", choices=c("Hello", "Bonjour")), 
    textInput("name", "Enter Your Name"),
    textOutput("greeting_out")
)

server <- function(input, output, session) {
    output$greeting_out <- renderText({paste0(input$greeting, ", ", input$name)})
}

shinyApp(ui = ui, server = server)


# Return the 10 most popular baby names for a given year and sex, with
# `name` releveled in row order so plots keep the popularity ordering.
get_top_names <- function(.year, .sex) {
  babynames::babynames %>% 
    filter(year == .year) %>% 
    filter(sex == .sex) %>% 
    # FIX: give top_n() an explicit ranking column; a bare top_n(10)
    # implicitly ranks by the last column, which is fragile. Explicit
    # `prop` matches the other top_n() calls in this document.
    top_n(10, prop) %>% 
    mutate(name = forcats::fct_inorder(name))
}

# Popular-names app: sex and year inputs drive a bar plot built from
# get_top_names() (defined above this chunk).
ui <- fluidPage(
    titlePanel("Most Popular Names"), 
    sidebarLayout(
        sidebarPanel(
            selectInput("sex", "Select Sex", selected="M", choices=c("M", "F")), 
            sliderInput("year", "Select Year", value=1900, min=1880, max=2017)
        ), 
        mainPanel(
            plotOutput("popular")
        )
    )
)

server <- function(input, output, session) {
    output$popular <- renderPlot({ get_top_names(input$year, input$sex) %>% 
                                      ggplot(aes(x=name, y=prop)) + 
                                      geom_col()
                                 })
}

shinyApp(ui = ui, server = server)


# Popular-names app with Plot and Table tabs sharing the same inputs.
ui <- fluidPage(
    titlePanel("Most Popular Names"), 
    sidebarLayout(
        sidebarPanel(
            selectInput("sex", "Select Sex", selected="M", choices=c("M", "F")), 
            sliderInput("year", "Select Year", value=1900, min=1880, max=2017)
        ), 
        mainPanel(
            tabsetPanel(
                tabPanel("Plot", plotOutput("popular")), 
                tabPanel("Table", DT::DTOutput("table"))
            )
        )
    )
)

server <- function(input, output, session) {
    output$popular <- renderPlot({ get_top_names(input$year, input$sex) %>% 
                                      ggplot(aes(x=name, y=prop)) + 
                                      geom_col()
                                 })
    output$table <- DT::renderDT({ get_top_names(input$year, input$sex) })
}

shinyApp(ui = ui, server = server)

Chapter 3 - Reactive Programming

Reactivity 101:

  • Reactive programming updates any time an input is updated (typically, a user input)
  • Reactive conductors are intermediates that either 1) depend on reactive sources, or 2) update a reactive endpoint
  • Reactive expressions are lazy and cached, which can be beneficial to avoid working multiple times
    • Reactive expressions are called only when the value of a source changes and the endpoint requires the reactive expression

Observers vs Reactives:

  • Reactive flow connects reactive components to create an application
  • Observers can access reactive sources but do not return values
    • Observers are typically called for side effects such as sending data to the web server
    • observe({ … })
  • Reactives calculate values without side effects (return values, lazy, no side effects)
  • Observers are called for the side effects (no return values, responsive, side effects)

Stop-Delay-Trigger:

  • By default, any changes to inputs will drive changes to the outputs
  • The function isolate() can override the default behavior - any reactive wrapped in isolate() does not trigger automatic actions
  • Can use eventReactive(input$button, { }) to specify an event such as a button press that should drive the action
  • The observeEvent() is similar to an eventReactive() but acts in response to a user action
    • The observeEvent() is called ONLY for side effects; the observer equivalent to eventReactive()

Applying Reactivity Concepts:

  • Reactives include sources (input$...), conductors (reactive expressions), and endpoints (output$...)
    • Conductors are often useful for lengthy calculations, especially when Stop-Delay-Trigger are applied

Example code includes:

# BMI server: one reactive conductor (rval_bmi) feeds two text endpoints, so
# the BMI formula is evaluated once and cached.
server <- function(input, output, session) {
  # BMI = weight (kg) / height (m)^2
  rval_bmi <- reactive({
    input$weight / (input$height^2)
  })

  output$bmi <- renderText({
    paste("Your BMI is", round(rval_bmi(), 1))
  })

  output$bmi_range <- renderText({
    # Bucket the BMI into a health category.
    # NOTE(review): values outside (0, 40] fall outside the breaks and
    # produce NA -- confirm whether an open-ended top bucket was intended.
    bmi_status <- cut(
      rval_bmi(),
      breaks = c(0, 18.5, 24.9, 29.9, 40),
      labels = c('underweight', 'healthy', 'overweight', 'obese')
    )
    paste("You are", bmi_status)
  })
}

# BMI UI: numericInput positional args are (inputId, label, value, min, max),
# so height defaults to 1.5 m in [1, 2] and weight to 60 kg in [45, 120].
ui <- fluidPage(
  titlePanel('BMI Calculator'),
  sidebarLayout(
    sidebarPanel(
      numericInput('height', 'Enter your height in meters', 1.5, 1, 2),
      numericInput('weight', 'Enter your weight in Kilograms', 60, 45, 120)
    ),
    mainPanel(
      textOutput("bmi"),
      textOutput("bmi_range")
    )
  )
)

shinyApp(ui = ui, server = server)


# BMI app, second version: the status calculation is itself a reactive
# conductor (rval_bmi_status) chained off rval_bmi, so both values are lazy
# and cached.
server <- function(input, output, session) {
  rval_bmi <- reactive({
    input$weight/(input$height^2)
  })
  # CODE BELOW: Add a reactive expression rval_bmi_status to 
  # return health status as underweight etc. based on inputs
  # NOTE(review): BMI values outside (0, 40] fall outside the breaks and
  # yield NA -- confirm intended.
  rval_bmi_status <- reactive({
      cut(rval_bmi(), breaks = c(0, 18.5, 24.9, 29.9, 40), 
          labels = c('underweight', 'healthy', 'overweight', 'obese')
          )
  })
  output$bmi <- renderText({
    bmi <- rval_bmi()
    paste("Your BMI is", round(bmi, 1))
  })
  output$bmi_status <- renderText({
    # MODIFY CODE BELOW: Replace right-hand-side with 
    # reactive expression rval_bmi_status
    bmi_status <- rval_bmi_status()
    paste("You are", bmi_status)
  })
}
ui <- fluidPage(
  titlePanel('BMI Calculator'),
  sidebarLayout(
    sidebarPanel(
      numericInput('height', 'Enter your height in meters', 1.5, 1, 2),
      numericInput('weight', 'Enter your weight in Kilograms', 60, 45, 120)
    ),
    mainPanel(
      textOutput("bmi"),
      textOutput("bmi_status")
    )
  )
)

shinyApp(ui = ui, server = server)


# Observer demo: observers are called for side effects only (here, a
# notification) and return no value.
ui <- fluidPage(
  textInput('name', 'Enter your name')
)

server <- function(input, output, session) {
  # Fires every time input$name changes.
  observe({
    msg <- paste0("You have entered the name ", input$name)
    showNotification(msg)
  })
}

shinyApp(ui = ui, server = server)


# isolate() demo: the greeting re-renders when the BMI inputs change, but
# typing in the name box alone does NOT retrigger the output -- the isolated
# read of input$name creates no reactive dependency.
server <- function(input, output, session) {
  rval_bmi <- reactive({
    input$weight/(input$height^2)
  })
  output$bmi <- renderText({
    bmi <- rval_bmi()
    # MODIFY CODE BELOW: 
    # Use isolate to stop output from updating when name changes.
    paste("Hi", isolate({input$name}), ". Your BMI is", round(bmi, 1))
  })
}

ui <- fluidPage(
  titlePanel('BMI Calculator'),
  sidebarLayout(
    sidebarPanel(
      textInput('name', 'Enter your name'),
      numericInput('height', 'Enter your height (in m)', 1.5, 1, 2, step = 0.1),
      numericInput('weight', 'Enter your weight (in Kg)', 60, 45, 120)
    ),
    mainPanel(
      textOutput("bmi")
    )
  )
)

shinyApp(ui = ui, server = server)


# eventReactive() demo (Stop-Delay-Trigger): the BMI is recomputed only when
# the "Show BMI" button fires, not on every input change.
server <- function(input, output, session) {
  # MODIFY CODE BELOW: Use eventReactive to delay the execution of the
  # calculation until the user clicks on the show_bmi button (Show BMI)
  rval_bmi <- eventReactive(input$show_bmi, {
    input$weight/(input$height^2)
  })
  output$bmi <- renderText({
    bmi <- rval_bmi()
    paste("Hi", input$name, ". Your BMI is", round(bmi, 1))
  })
}

ui <- fluidPage(
  titlePanel('BMI Calculator'),
  sidebarLayout(
    sidebarPanel(
      textInput('name', 'Enter your name'),
      numericInput('height', 'Enter height (in m)', 1.5, 1, 2, step = 0.1),
      numericInput('weight', 'Enter weight (in Kg)', 60, 45, 120),
      actionButton("show_bmi", "Show BMI")
    ),
    mainPanel(
      textOutput("bmi")
    )
  )
)

shinyApp(ui = ui, server = server)


# observeEvent() demo: the Help button triggers a side effect (a modal
# dialog), while eventReactive() gates the BMI calculation on the other
# button -- observer vs. reactive in one app.
# NOTE(review): bmi_help_text must be defined elsewhere before launching;
# it is not visible in this chunk.
server <- function(input, output, session) {
  # MODIFY CODE BELOW: Wrap in observeEvent() so the help text 
  # is displayed when a user clicks on the Help button.
  observeEvent(input$show_help, {
     # Display a modal dialog with bmi_help_text
     # MODIFY CODE BELOW: Uncomment code
     showModal(modalDialog(bmi_help_text))
  })
  rv_bmi <- eventReactive(input$show_bmi, {
    input$weight/(input$height^2)
  })
  output$bmi <- renderText({
    bmi <- rv_bmi()
    paste("Hi", input$name, ". Your BMI is", round(bmi, 1))
  })
}

ui <- fluidPage(
  titlePanel('BMI Calculator'),
  sidebarLayout(
    sidebarPanel(
      textInput('name', 'Enter your name'),
      numericInput('height', 'Enter your height in meters', 1.5, 1, 2),
      numericInput('weight', 'Enter your weight in Kilograms', 60, 45, 120),
      actionButton("show_bmi", "Show BMI"), 
      # CODE BELOW: Add an action button named "show_help"
      actionButton("show_help", "Help")
    ),
    mainPanel(
      textOutput("bmi")
    )
  )
)

shinyApp(ui = ui, server = server)


# Inches-to-centimeters converter. The conversion is wrapped in
# eventReactive() so it runs only when the "Show height in cm" button is
# pressed (Stop-Delay-Trigger).
server <- function(input, output, session) {
  rval_height_cm <- eventReactive(input$show_height_cm, {
    # 1 inch = 2.54 cm
    input$height * 2.54
  })

  output$height_cm <- renderText({
    # Fix: the original body ended in a dead assignment
    # (`height_cm <- rval_height_cm()`), relying on the assignment's
    # invisible return value. Return the reactive's value directly.
    rval_height_cm()
  })
}

ui <- fluidPage(
  titlePanel("Inches to Centimeters Conversion"),
  sidebarLayout(
    sidebarPanel(
      numericInput("height", "Height (in)", 60),
      actionButton("show_height_cm", "Show height in cm")
    ),
    mainPanel(
      textOutput("height_cm")
    )
  )
)

shinyApp(ui = ui, server = server)

Chapter 4 - Build Shiny Apps

Build an Alien Sightings Dashboard:

  • National UFO Center contains details on UFO sightings worldwide

Explore the 2014 Mental Health Tech Survey:

  • Example of plotting data and adding error messages for a dashboard
  • Custom error messages can be added using validate() - for example
    • validate(need(input$age != "", "Be sure to select an age"))
  • The shinyWidgets package includes a gallery capability for apps

Explore Cuisines:

  • Example at looking at ingredients by major cuisine types
    • Word Cloud
    • Bar Plot
    • DT Table

Mass Shootings:

  • US mass shooting data from 1982 to present
  • Can use the bootstrapPage() to allow for a full page with no margins
    • theme=shinythemes::shinytheme("simplex")
    • leaflet::leafletOutput("map", width="100%", height="100%"),
    • absolutePanel(top=10, right=10, id="controls", sliderInput(...), dateRangeInput(...))

Wrap Up:

  • Shiny App and examples - client-server
  • Inputs, Outputs, Layouts
  • Reactivity and Stop-Delay-Trigger
  • Case Study Applications

Example code includes:

# Load the four Chapter-4 case-study datasets from local input files.
usa_ufo_sightings <- readr::read_csv("./RInputFiles/usa_ufo_sightings.csv")
mental_health_survey <- readr::read_csv("./RInputFiles/mental_health_survey_edited.csv")
recipes <- readRDS("./RInputFiles/recipes.rds")
mass_shootings <- readr::read_csv("./RInputFiles/mass-shootings.csv")

# give.attr=FALSE keeps str() output readable by hiding attribute spam
# (readr stores a spec attribute on its tibbles).
str(usa_ufo_sightings, give.attr=FALSE)
str(mental_health_survey, give.attr=FALSE)
str(recipes, give.attr=FALSE)
str(mass_shootings, give.attr=FALSE)


# Sorted list of states for the dropdown.
states <- sort(unique(usa_ufo_sightings$state))

# UFO dashboard, first pass: inputs only; the server is filled in by the
# next exercise.
ui <- fluidPage(
  # CODE BELOW: Add a title
  titlePanel("UFO Sightings"),
  sidebarLayout(
    sidebarPanel(
      # CODE BELOW: One input to select a U.S. state
      # And one input to select a range of dates
      selectInput("state", "Choose a U.S. state:", selected = "AK", choices = states),
      # Fix: renamed the input from "date" to "dates" for consistency with
      # the later versions of this app, whose servers read input$dates[1]
      # and input$dates[2].
      dateRangeInput("dates", "Choose a date range:", start = "1920-01-01", end = "1950-01-01")
    ),
    mainPanel()
  )
)

server <- function(input, output) {
  # Intentionally empty for now.
}

shinyApp(ui, server)


# UFO dashboard, second pass: both outputs apply the same state + date-range
# filter, then either plot counts by shape or tabulate duration statistics.
server <- function(input, output) {
  # CODE BELOW: Create a plot output name 'shapes', of sightings by shape,
  # For the selected inputs
  output$shapes <- renderPlot({
    usa_ufo_sightings %>%
      filter(state == input$state,
             date_sighted >= input$dates[1],
             date_sighted <= input$dates[2]) %>%
      ggplot(aes(shape)) +
      geom_bar() +
      labs(x = "Shape", y = "# Sighted")
  })
  # CODE BELOW: Create a table output named 'duration_table', by shape, 
  # of # sighted, plus mean, median, max, and min duration of sightings
  # for the selected inputs
  output$duration_table <- renderTable({
    usa_ufo_sightings %>%
      filter(
        state == input$state,
        date_sighted >= input$dates[1],
        date_sighted <= input$dates[2]
      ) %>%
      group_by(shape) %>%
      # durations are stored in seconds; divide by 60 to report minutes
      summarize(
        nb_sighted = n(),
        avg_duration_min = mean(duration_sec) / 60,
        median_duration_min = median(duration_sec) / 60,
        min_duration_min = min(duration_sec) / 60,
        max_duration_min = max(duration_sec) / 60
      )
  })
}

ui <- fluidPage(
  titlePanel("UFO Sightings"),
  sidebarLayout(
    sidebarPanel(
      selectInput("state", "Choose a U.S. state:", choices = unique(usa_ufo_sightings$state)),
      dateRangeInput("dates", "Choose a date range:",
                     start = "1920-01-01",
                     end = "1950-01-01")
    ),
    mainPanel(
      # Add plot output named 'shapes'
      plotOutput("shapes"),
      # Add table output named 'duration_table'
      tableOutput("duration_table")
    )
  )
)

shinyApp(ui, server)


# UFO dashboard, tabbed variant: plot and table now live on separate tabs.
# NOTE(review): sidebarPanel()/mainPanel() appear here without a wrapping
# sidebarLayout() (unlike the earlier version) -- confirm the layout renders
# as intended.
ui <- fluidPage(
  titlePanel("UFO Sightings"),
  sidebarPanel(
    selectInput("state", "Choose a U.S. state:", choices = unique(usa_ufo_sightings$state)),
    dateRangeInput("dates", "Choose a date range:",
      start = "1920-01-01",
      end = "1950-01-01"
    )
  ),
  # MODIFY CODE BELOW: Create a tab layout for the dashboard
  mainPanel(
    tabsetPanel( tabPanel("Plot", plotOutput("shapes")), tabPanel("Table", tableOutput("duration_table")) )
  )
)

# Server is unchanged from the previous exercise.
server <- function(input, output) {
  output$shapes <- renderPlot({
    usa_ufo_sightings %>%
      filter(
        state == input$state,
        date_sighted >= input$dates[1],
        date_sighted <= input$dates[2]
      ) %>%
      ggplot(aes(shape)) +
      geom_bar() +
      labs(
        x = "Shape",
        y = "# Sighted"
      )
  })

  output$duration_table <- renderTable({
    usa_ufo_sightings %>%
      filter(
        state == input$state,
        date_sighted >= input$dates[1],
        date_sighted <= input$dates[2]
      ) %>%
      group_by(shape) %>%
      summarize(
        nb_sighted = n(),
        avg_duration_min = mean(duration_sec) / 60,
        median_duration_min = median(duration_sec) / 60,
        min_duration_min = min(duration_sec) / 60,
        max_duration_min = max(duration_sec) / 60
      )
  })
}

shinyApp(ui, server)


# Mental-health survey UI, first pass: a checkbox group (can hold several
# values at once) and a shinyWidgets picker feed the age histogram.
ui <- fluidPage(
  # CODE BELOW: Add an appropriate title
  titlePanel("2014 Mental Health in Tech Survey"), 
  sidebarPanel(
  
    checkboxGroupInput("mental_health_consequence", "Do you think that discussing a mental health issue with your employer would have negative consequences?", choices=c("Maybe", "Yes", "No"), selected="Maybe"), 
  
    # NOTE(review): "Nothing selected" is not one of the choices, so it
    # cannot be the initial selection -- confirm whether an empty initial
    # selection was intended (the later version uses multiple=TRUE instead).
    shinyWidgets::pickerInput("mental_vs_physical", "Do you feel that your employer takes mental health as seriously as physical health?", choices=c("Don't know", "Yes","No"), selected="Nothing selected")
  ),
  mainPanel(

    plotOutput("ageHist")
  )
)

# Server: histogram of respondent age, filtered by the two sidebar inputs.
server <- function(input, output, session) {
  # CODE BELOW: Build a histogram of the age of respondents
  # Filtered by the two inputs
  output$ageHist <- renderPlot({
    mental_health_survey %>%
      # Fix: `mental_health_consequence` comes from a checkboxGroupInput,
      # which can hold several values at once. `==` would recycle the
      # comparison vector and silently drop matching rows; `%in%` handles
      # any number of selections (and matches the later version of this app
      # at the validate() exercise).
      filter(mental_health_consequence %in% input$mental_health_consequence,
             mental_vs_physical %in% input$mental_vs_physical
             ) %>%
      ggplot(aes(x=Age)) + 
        geom_histogram()
  })
}

shinyApp(ui, server)


# Mental-health survey, second pass: validate()/need() shows a friendly
# message instead of an error until the picker has a selection.
server <- function(input, output, session) {
  output$age <- renderPlot({
    # MODIFY CODE BELOW: Add validation that user selected a 3rd input
    validate(
      need(
        input$mental_vs_physical != "", 
        "Make a selection for mental vs. physical health."
      )
    )

    mental_health_survey %>%
      filter(
        work_interfere == input$work_interfere,
        mental_health_consequence %in% input$mental_health_consequence,
        mental_vs_physical %in% input$mental_vs_physical
      ) %>%
      ggplot(aes(Age)) +
      geom_histogram()
  })
}

ui <- fluidPage(
  titlePanel("2014 Mental Health in Tech Survey"),
  sidebarPanel(
    # sliderTextInput: a slider over ordered text choices (shinyWidgets).
    shinyWidgets::sliderTextInput(
      inputId = "work_interfere",
      label = "If you have a mental health condition, do you feel that it interferes with your work?", 
      grid = TRUE,
      force_edges = TRUE,
      choices = c("Never", "Rarely", "Sometimes", "Often")
    ),
    checkboxGroupInput(
      inputId = "mental_health_consequence",
      label = "Do you think that discussing a mental health issue with your employer would have negative consequences?", 
      choices = c("Maybe", "Yes", "No"),
      selected = "Maybe"
    ),
    # multiple=TRUE starts with nothing selected, which is what the
    # validate() guard above checks for.
    shinyWidgets::pickerInput(
      inputId = "mental_vs_physical",
      label = "Do you feel that your employer takes mental health as seriously as physical health?", 
      choices = c("Don't Know", "No", "Yes"),
      multiple = TRUE
    )
  ),
  mainPanel(
    plotOutput("age")  
  )
)

shinyApp(ui, server)


# Flatten the list-column of ingredients into one row per
# (id, cuisine, ingredient): each recipe's ingredient vector becomes a small
# data frame, and the pieces are combined once at the end with bind_rows()
# (never rbind() inside the loop).
oldRecipe <- recipes
cuisineList <- vector("list", nrow(oldRecipe))

# Fixes: iterate with seq_len() -- `1:nrow(x)` yields c(1, 0) on an empty
# frame -- and index the same object we measure (the original mixed
# `nrow(recipes)` with `oldRecipe[...]`, which only worked by coincidence
# because the two were still identical at that point).
for (thisRow in seq_len(nrow(oldRecipe))) {
    cuisineList[[thisRow]] <- data.frame(id=oldRecipe$id[thisRow], cuisine=oldRecipe$cuisine[thisRow], 
                                         ingredient=oldRecipe$ingredients[thisRow][[1]],
                                         stringsAsFactors=FALSE
                                         )
}

recipes <- bind_rows(cuisineList)
str(recipes)


# Cuisines explorer, first pass: pick a cuisine and how many top ingredients
# to show; the DT table lists ingredients by recipe count, descending.
ui <- fluidPage(
  titlePanel('Explore Cuisines'),
  sidebarLayout(
    sidebarPanel(
      # CODE BELOW: Add an input named "cuisine" to select a cuisine
      selectInput("cuisine", "Select Cuisine", choices=unique(recipes$cuisine), selected="greek"),
      # CODE BELOW: Add an input named "nb_ingredients" to select # of ingredients
      sliderInput("nb_ingredients", "Select No. of Ingredients", min=1, max=100, value=10)
    ),
    mainPanel(
      # CODE BELOW: Add a DT output named "dt_top_ingredients"
      DT::DTOutput("dt_top_ingredients")
    )
  )
)

server <- function(input, output, session) {
  # CODE BELOW: Render the top ingredients in a chosen cuisine as 
  # an interactive data table and assign it to output object `dt_top_ingredients`
  output$dt_top_ingredients <- DT::renderDT({
    recipes %>%
      filter(cuisine == input$cuisine) %>%
      count(ingredient, name="nb_recipes") %>%
      arrange(desc(nb_recipes)) %>%
      head(input$nb_ingredients)
  })
}

shinyApp(ui, server)


# Add tf-idf scores per (cuisine, ingredient): ingredients common everywhere
# score low, ingredients distinctive of one cuisine score high.
recipes_enriched <- recipes %>%
    count(cuisine, ingredient, name="nb_recipes") %>%
    tidytext::bind_tf_idf(term="ingredient", document="cuisine", n="nb_recipes")
str(recipes_enriched)


# Cuisines explorer, second pass: tabbed main panel with a plotly bar chart
# and the DT table.
# Fix: removed the trailing comma after sliderInput() -- an empty trailing
# argument in an R call raises "argument is empty" at runtime.
ui <- fluidPage(
  titlePanel('Explore Cuisines'),
  sidebarLayout(
    sidebarPanel(
      selectInput('cuisine', 'Select Cuisine', unique(recipes$cuisine)),
      sliderInput('nb_ingredients', 'Select No. of Ingredients', 1, 100, 10)
    ),
    mainPanel(
      tabsetPanel(
        # CODE BELOW: Add a plotly output named "plot_dt_ingredients"
        tabPanel("Plot", plotly::plotlyOutput("plot_top_ingredients")),
        tabPanel('Table', DT::DTOutput('dt_top_ingredients'))
      )
    )
  )
)

# Server for the cuisines explorer.
server <- function(input, output, session) {
  # Conductor: the nb_ingredients highest-tf-idf ingredients for the chosen
  # cuisine. Cached, so both consumers below share one computation.
  rval_top_ingredients <- reactive({
    recipes_enriched %>%
      filter(cuisine == input$cuisine) %>%
      arrange(desc(tf_idf)) %>%
      head(input$nb_ingredients)
  })

  # Horizontal bar chart of tf-idf per ingredient (flipped coords).
  output$plot_top_ingredients <- plotly::renderPlotly({
    top <- rval_top_ingredients()
    ggplot(top, aes(x = ingredient, y = tf_idf)) +
      geom_col() +
      coord_flip()
  })

  # Interactive table of raw recipe counts per ingredient.
  output$dt_top_ingredients <- DT::renderDT({
    recipes %>%
      filter(cuisine == input$cuisine) %>%
      count(ingredient, name = 'nb_recipes') %>%
      arrange(desc(nb_recipes)) %>%
      head(input$nb_ingredients)
  })
}

shinyApp(ui, server)


# Word-cloud variant of the cuisines explorer, kept commented out -- the
# exercise's d3wordcloud package was replaced here with wordcloud2, and the
# block is retained for reference only (note the two uncommented exercise
# comment lines inside; the block is not runnable as-is).
# ui <- fluidPage(
#   titlePanel('Explore Cuisines'),
#   sidebarLayout(
#     sidebarPanel(
#       selectInput('cuisine', 'Select Cuisine', unique(recipes$cuisine)),
#       sliderInput('nb_ingredients', 'Select No. of Ingredients', 5, 100, 20),
#     ),
#     mainPanel(
#       tabsetPanel(
        # CODE BELOW: Add `d3wordcloudOutput` named `wc_ingredients` in a `tabPanel`
# #         tabPanel("Word Cloud", wordcloud2::wordcloud2Output("wc_ingredients")),
#         tabPanel('Plot', plotly::plotlyOutput('plot_top_ingredients')),
#         tabPanel('Table', DT::DTOutput('dt_top_ingredients'))
#       )
#     )
#   )
# )
# server <- function(input, output, session){
  # CODE BELOW: Render an interactive wordcloud of top ingredients and 
  # the number of recipes they get used in, using `d3wordcloud::renderD3wordcloud`,
  # and assign it to an output named `wc_ingredients`.
#   output$wc_ingredients <- wordcloud2::renderWordcloud2({
#     d <- rval_top_ingredients()
#     wordcloud2::wordcloud2(d)
#   })
#   rval_top_ingredients <- reactive({
#     recipes_enriched %>% 
#       filter(cuisine == input$cuisine) %>% 
#       arrange(desc(tf_idf)) %>% 
#       head(input$nb_ingredients) %>% 
#       mutate(ingredient = forcats::fct_reorder(ingredient, tf_idf), word=as.character(ingredient),
#              freq=nb_recipes
#              )
#   })
#   output$plot_top_ingredients <- plotly::renderPlotly({
#     rval_top_ingredients() %>%
#       ggplot(aes(x = ingredient, y = tf_idf)) +
#       geom_col() +
#       coord_flip()
#   })
#   output$dt_top_ingredients <- DT::renderDT({
#     recipes %>% 
#       filter(cuisine == input$cuisine) %>% 
#       count(ingredient, name = 'nb_recipes') %>% 
#       arrange(desc(nb_recipes)) %>% 
#       head(input$nb_ingredients)
#   })
# }
# shinyApp(ui = ui, server= server)


# Parse the month/day/year date strings into Date objects for range filtering.
mass_shootings$date <- lubridate::mdy(mass_shootings$date)


# Full-bleed leaflet dashboard: bootstrapPage() (no page margins) with the
# map at 100% width/height and a floating control panel on top.
ui <- bootstrapPage(
  theme = shinythemes::shinytheme('simplex'),
  leaflet::leafletOutput('map', width = '100%', height = '100%'),
  absolutePanel(top = 10, right = 10, id = 'controls',
    sliderInput('nb_fatalities', 'Minimum Fatalities', 1, 40, 10),
    dateRangeInput(
      'date_range', 'Select Date', "2010-01-01", "2019-12-01"
    ),
    # CODE BELOW: Add an action button named show_about
    actionButton("show_about", "About")
  ),
  # The html/body 100% rule is required for the map's percentage height to
  # resolve; the #controls rule styles the floating panel.
  tags$style(type = "text/css", "
    html, body {width:100%;height:100%}     
    #controls{background-color:white;padding:20px;}
  ")
)
server <- function(input, output, session) {
  # CODE BELOW: Use observeEvent to display a modal dialog
  # with the help text stored in text_about.
  # NOTE(review): text_about must be defined elsewhere before launching.
  observeEvent(input$show_about, {
    showModal(modalDialog(text_about, title="About"))
  })
  output$map <- leaflet::renderLeaflet({
    mass_shootings %>% 
      filter(
        date >= input$date_range[1],
        date <= input$date_range[2],
        fatalities >= input$nb_fatalities
      ) %>% 
      leaflet::leaflet() %>% 
      # Center on the continental US; marker radius scales with fatalities.
      leaflet::setView( -98.58, 39.82, zoom = 5) %>% 
      leaflet::addTiles() %>% 
      leaflet::addCircleMarkers(
        popup = ~ summary, radius = ~ sqrt(fatalities)*3,
        fillColor = 'red', color = 'red', weight = 1
      )
  })
}

shinyApp(ui, server)

Intermediate Data Visualization with ggplot2

Chapter 1 - Statistics

Stats with Geoms:

  • Statistics can be called independently, or within a geom
    • All start with stat_
    • geom_bar() runs stat_count() by default
    • geom_smooth() runs stat_smooth() by default - method defaults to loess for fewer than 1,000 observations and gam for 1,000 or more

Stats: Sum and Quantile:

  • Over-plotting is frequently a concern with large, overlapping datasets
    • Can use geom_count() and stat_sum() to get counts rather than just over-plotting
    • By default, geom_count() will size points by the number of observations
  • The geom_quantile() is a great tool for describing the data
    • Associated with stat_quantile() as well

Stats Outside Geoms:

  • Can use ggplot for calculating statistics
    • mean_sdl(xx, mult=1) # will be +/- 1 SD
    • Can use stat_summary() to summarize y by x
    • Can use stat_function() to compute y as a function of x
    • Can use stat_qq() to perform calculation from a quantile-quantile plot

Example code includes:

# Smoothing basics on mtcars: geom_smooth() and stat_smooth() are
# interchangeable here (the geom runs the stat by default).
# View the structure of mtcars
data(mtcars)
str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
# Using mtcars, draw a scatter plot of mpg vs. wt
ggplot(mtcars, aes(x=wt, y=mpg)) + 
  geom_point()

# Amend the plot to add a smooth layer
# (with 32 observations the default method resolves to loess, per the message)
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() + 
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Amend the plot. Use lin. reg. smoothing; turn off std err ribbon
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  geom_smooth(method="lm", se=FALSE)

# Amend the plot. Swap geom_smooth() for stat_smooth().
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE)

# Add factor versions of cyl and am so they can be used as discrete
# aesthetics (color/fill) instead of continuous numbers.
mtcars <- mtcars %>%
    mutate(fcyl=factor(cyl), fam=factor(am))
str(mtcars)
## 'data.frame':    32 obs. of  13 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
##  $ fcyl: Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
##  $ fam : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
# Using mtcars, plot mpg vs. wt, colored by fcyl
ggplot(mtcars, aes(x=wt, y=mpg, color=fcyl)) +
  # Add a point layer
  geom_point() +
  # Add a smooth lin reg stat, no ribbon
  stat_smooth(method="lm", se=FALSE)

# Amend the plot to add another smooth layer with dummy grouping
# (group=1 overrides the color grouping, giving one overall regression line)
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  stat_smooth(aes(group=1), method="lm", se=FALSE)

# LOESS span comparison: smaller span = more local, wigglier fit.
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  # Add 3 smooth LOESS stats, varying span & color
  stat_smooth(method = "loess", color = "red", span = 0.9, se=FALSE) +
  stat_smooth(method = "loess", color = "green", span = 0.6, se=FALSE) +
  stat_smooth(method = "loess", color = "blue", span = 0.3, se=FALSE)

# Amend the plot to color by fcyl
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point() +
  # Add a smooth LOESS stat, no ribbon
  stat_smooth(method="loess", se=FALSE) +
  # Add a smooth lin. reg. stat, no ribbon
  stat_smooth(method="lm", se=FALSE)

# Amend the plot
# (aes(color="All") creates a dummy legend entry for the overall LOESS fit)
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl)) +
  geom_point() +
  # Map color to dummy variable "All"
  stat_smooth(aes(color="All"), se = FALSE) +
  stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Vocab examples: jitter + regression, quantile regression, and stat_sum()
# as an over-plotting remedy on a large (30k-row) discrete dataset.
data(Vocab, package="carData")
Vocab <- Vocab %>%
    mutate(year_group=factor(ifelse(year<=1994, 1974, 2016)))
str(Vocab)
## 'data.frame':    30351 obs. of  5 variables:
##  $ year      : num  1974 1974 1974 1974 1974 ...
##  $ sex       : Factor w/ 2 levels "Female","Male": 2 2 1 1 1 2 2 2 1 1 ...
##  $ education : num  14 16 10 10 12 16 17 10 12 11 ...
##  $ vocabulary: num  9 9 9 5 8 8 9 5 3 5 ...
##  $ year_group: Factor w/ 2 levels "1974","2016": 1 1 1 1 1 1 1 1 1 1 ...
# Using Vocab, plot vocabulary vs. education, colored by year group
ggplot(Vocab, aes(x=education, y=vocabulary, color=year_group)) +
  # Add jittered points with transparency 0.25
  geom_jitter(alpha=0.25) +
  # Add a smooth lin. reg. line (with ribbon)
  stat_smooth(method="lm")

# Amend the plot
ggplot(Vocab, aes(x = education, y = vocabulary, color = year_group)) +
  geom_jitter(alpha = 0.25) +
  # Map the fill color to year_group, set the line size to 2
  stat_smooth(method = "lm", aes(fill=year_group), size=2)

# Amend the plot to color by year_group
# (stat_quantile fits quantile regressions at the 5th/50th/95th percentiles)
ggplot(Vocab, aes(x = education, y = vocabulary)) +
  geom_jitter(alpha = 0.25) +
  stat_quantile(quantiles = c(0.05, 0.5, 0.95))
## Smoothing formula not specified. Using: y ~ x

# Amend the plot to color by year_group
ggplot(Vocab, aes(x = education, y = vocabulary, color=year_group)) +
  geom_jitter(alpha = 0.25) +
  stat_quantile(quantiles = c(0.05, 0.5, 0.95))
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

# Run this, look at the plot, then update it
ggplot(Vocab, aes(x = education, y = vocabulary)) +
  # Replace this with a sum stat
  stat_sum()

ggplot(Vocab, aes(x = education, y = vocabulary)) +
  stat_sum() +
  # Add a size scale, from 1 to 10
  scale_size(range=c(1, 10))

# Amend the stat to use proportion sizes
# NOTE(review): the ..prop.. syntax is the old calculated-aesthetic notation
# (fine for the ggplot2 3.2.1 used in this document); newer ggplot2 prefers
# after_stat(prop) -- confirm before upgrading.
ggplot(Vocab, aes(x = education, y = vocabulary)) +
  stat_sum(aes(size = ..prop..))

# Amend the plot to group by education
ggplot(Vocab, aes(x = education, y = vocabulary, group = education)) +
  stat_sum(aes(size = ..prop..))

# Position adjustments (jitter, dodge, both) plus stat_summary() layers
# built on a shared base plot object.
# From previous step
posn_j <- position_jitter(width = 0.2)
posn_d <- position_dodge(width = 0.1)
posn_jd <- position_jitterdodge(jitter.width = 0.2, dodge.width = 0.1)

# Create the plot base: wt vs. fcyl, colored by fam
p_wt_vs_fcyl_by_fam <- ggplot(mtcars, aes(x=fcyl, y=wt, color=fam))

# Add a point layer
p_wt_vs_fcyl_by_fam +
  geom_point()

# Add jittering only
p_wt_vs_fcyl_by_fam +
    geom_point(position=posn_j)

# Add dodging only
p_wt_vs_fcyl_by_fam +
    geom_point(position=posn_d)

# Add jittering and dodging
p_wt_vs_fcyl_by_fam_jit <- p_wt_vs_fcyl_by_fam +
    geom_point(position=posn_jd)
p_wt_vs_fcyl_by_fam_jit

# mean_sdl with mult=1 summarizes each group as mean +/- 1 SD
p_wt_vs_fcyl_by_fam_jit +
  # Add a summary stat of std deviation limits
  stat_summary(fun.data=mean_sdl, fun.args=list(mult=1), position=posn_d)

p_wt_vs_fcyl_by_fam_jit +
  # Change the geom to be an errorbar
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), position = posn_d, geom="errorbar")

p_wt_vs_fcyl_by_fam_jit +
  # Add a summary stat of normal confidence limits
  stat_summary(fun.data = mean_cl_normal, position = posn_d)


Chapter 2 - Coordinates

Coordinates:

  • The coordinate layer is represented by coord_ functions
    • The default is coord_cartesian
    • myPlot + coord_cartesian(xlim=…)
    • Changing the x/y coordinates is risky and should be applied with care
  • Aspect ratios should typically be 1:1 when the units of measure are the same
    • Changing the aspect ratio can inadvertently or deliberately convey the wrong message
    • Can use coord_fixed(asp_ratio) to set these

Coordinates vs Scales:

  • Can use log or log10 for data with a highly positive skew
    • scale_x_log10()

Double and Flipped Axes:

  • Double x/y axis is generally strongly discouraged but occasionally useful
  • Flipped axes can be useful for adjusting geometries or meanings of axes
    • coord_flip() # can only use one coord_ per plot, so this precludes changing the aspect ratio

Polar Coordinates:

  • Projections can map objects on to a 2D space
  • Can create a polar transformation using coord_polar()
    • Commonly, coord_polar(theta="y")
  • Polar coordinates considerably distort the data and should be used with significant caution

Example code includes:

# coord_cartesian() zooms without dropping data (unlike scale limits, which
# would discard points before the smooth is fitted).
ggplot(mtcars, aes(x = wt, y = hp, color = fam)) +
  geom_point() +
  geom_smooth() +
  # Add Cartesian coordinates with x limits from 3 to 6
  coord_cartesian(xlim=c(3, 6))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

data(iris)
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Both axes are in cm, so a 1:1 aspect ratio is the honest display.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_jitter() +
  geom_smooth(method = "lm", se = FALSE) +
  # Fix the coordinate ratio
  coord_fixed(1)

# Build a monthly sunspots data frame from the base time series.
data(sunspot.month)
str(sunspot.month)
##  Time-Series [1:3177] from 1749 to 2014: 58 62.6 70 55.7 85 83.5 94.8 66.3 75.9 75.5 ...
sunspots <- data.frame(Date=lubridate::ymd("1749-1-1") + months(0:(length(sunspot.month)-1)), 
                       Sunspots=as.numeric(sunspot.month)
                       )
str(sunspots)
## 'data.frame':    3177 obs. of  2 variables:
##  $ Date    : Date, format: "1749-01-01" "1749-02-01" ...
##  $ Sunspots: num  58 62.6 70 55.7 85 83.5 94.8 66.3 75.9 75.5 ...
# The rectangle highlights one region so the aspect-ratio change is visible.
sun_plot <- ggplot(sunspots, aes(x=Date, y=Sunspots)) + 
    geom_line(col="lightblue") + 
    geom_rect(aes(xmin=as.Date("1860-01-01"), xmax=as.Date("1935-01-01"), ymin=175, ymax=250), 
              col="orange", fill=NA
              )


# Fix the aspect ratio to 1:1
sun_plot +
  coord_fixed(1)

# Change the aspect ratio to 20:1
sun_plot +
  coord_fixed(20)

# expand/clip demo.
# NOTE(review): `expand` is documented as logical; the 0 here is coerced to
# FALSE (no padding around the data) -- consider writing expand = FALSE.
ggplot(mtcars, aes(wt, mpg)) +
  geom_point(size = 2) +
  # Add Cartesian coordinates with zero expansion
  coord_cartesian(expand=0) +
  theme_classic()

# clip="off" lets points on the panel edge draw outside the plotting area
# instead of being cut in half.
ggplot(mtcars, aes(wt, mpg)) +
  geom_point(size = 2) +
  # Turn clipping off
  coord_cartesian(expand = 0, clip="off") +
  theme_classic() +
  # Remove axis lines
  theme(axis.line=element_blank())

# Log-scale vs. log-coordinate comparison on the msleep body/brain weights:
# scale_*_log10() transforms the data BEFORE stats are computed, while
# coord_trans() transforms the axes AFTER (so the lm is fit on raw values).
data(msleep, package="ggplot2")
msleep <- msleep %>%
    select(bodywt, brainwt, vore) %>%
    filter(complete.cases(.))
str(msleep)
## Classes 'tbl_df', 'tbl' and 'data.frame':    51 obs. of  3 variables:
##  $ bodywt : num  0.48 0.019 600 14 14.8 33.5 0.728 0.42 0.06 1 ...
##  $ brainwt: num  0.0155 0.00029 0.423 0.07 0.0982 0.115 0.0055 0.0064 0.001 0.0066 ...
##  $ vore   : chr  "omni" "omni" "herbi" "carni" ...
# Produce a scatter plot of brainwt vs. bodywt
ggplot(msleep, aes(x=bodywt, y=brainwt)) +
  geom_point() +
  ggtitle("Raw Values")

# Add scale_*_*() functions
ggplot(msleep, aes(bodywt, brainwt)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  ggtitle("Scale_ functions")

# Perform a log10 coordinate system transformation
ggplot(msleep, aes(bodywt, brainwt)) +
  geom_point() +
  coord_trans(x="log10", y="log10")

# Plot with a scale_*_*() function:
ggplot(msleep, aes(bodywt, brainwt)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  # Add a log10 x scale
  scale_x_log10() +
  # Add a log10 y scale
  scale_y_log10() +
  ggtitle("Scale functions")

# Plot with transformed coordinates
ggplot(msleep, aes(bodywt, brainwt)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  # Add a log10 coordinate transformation for x and y axes
  coord_trans(x="log10", y="log10")

# Secondary-axis example: New York air quality (1973), with a Date column
# assembled from the Month and Day integers.
data(airquality)
airquality <- airquality %>%
    mutate(Date=lubridate::ymd(paste0("1973-", Month, "-", Day)))
str(airquality)
## 'data.frame':    153 obs. of  7 variables:
##  $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
##  $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
##  $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
##  $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
##  $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Date   : Date, format: "1973-05-01" "1973-05-02" ...
# Using airquality, plot Temp vs. Date
ggplot(airquality, aes(x=Date, y=Temp)) +
  # Add a line layer
  geom_line() +
  labs(x = "Date (1973)", y = "Fahrenheit")

# Define breaks (Fahrenheit)
y_breaks <- c(59, 68, 77, 86, 95, 104)

# Convert y_breaks from Fahrenheit to Celsius
# ((F - 32) / 1.8 is the same conversion as (F - 32) * 5 / 9 used below)
y_labels <- (y_breaks - 32) / 1.8

# Create a secondary y-axis: identity transformation keeps the same
# positions but relabels them with the Celsius equivalents.
secondary_y_axis <- sec_axis(
  # Use identity transformation
  trans = "identity",
  name = "Celsius",
  # Define breaks and labels as above
  breaks = y_breaks,
  labels = y_labels
)

# Examine the object
secondary_y_axis
## <ggproto object: Class AxisSecondary, gg>
##     axis: NULL
##     break_info: function
##     breaks: 59 68 77 86 95 104
##     create_scale: function
##     detail: 1000
##     empty: function
##     init: function
##     labels: 15 20 25 30 35 40
##     make_title: function
##     mono_test: function
##     name: Celsius
##     trans: function
##     transform_range: function
##     super:  <ggproto object: Class AxisSecondary, gg>
# From previous step (here `trans` is passed the identity FUNCTION rather
# than the string "identity"; both resolve to the same transformation)
y_breaks <- c(59, 68, 77, 86, 95, 104)
y_labels <- (y_breaks - 32) * 5 / 9
secondary_y_axis <- sec_axis(
  trans = identity,
  name = "Celsius",
  breaks = y_breaks,
  labels = y_labels
)

# Update the plot
ggplot(airquality, aes(Date, Temp)) +
  geom_line() +
  # Add the secondary y-axis 
  scale_y_continuous(sec.axis = secondary_y_axis) +
  labs(x = "Date (1973)", y = "Fahrenheit")

# Bar positioning, flipped axes, and a pie chart built from a stacked bar
# in polar coordinates.
# Plot fcyl bars, filled by fam
ggplot(mtcars, aes(x=fcyl, fill = fam)) +
  # Place bars side by side
  geom_bar(position = "dodge")

ggplot(mtcars, aes(fcyl, fill = fam)) +
  # Set a dodge width of 0.5 for partially overlapping bars
  geom_bar(position = position_dodge(width=0.5)) +
  coord_flip()

# Promote the row names into an explicit `car` column (order matches the
# standard mtcars row order).
mtcars$car <- c('Mazda RX4', 'Mazda RX4 Wag', 'Datsun 710', 'Hornet 4 Drive', 'Hornet Sportabout', 'Valiant', 'Duster 360', 'Merc 240D', 'Merc 230', 'Merc 280', 'Merc 280C', 'Merc 450SE', 'Merc 450SL', 'Merc 450SLC', 'Cadillac Fleetwood', 'Lincoln Continental', 'Chrysler Imperial', 'Fiat 128', 'Honda Civic', 'Toyota Corolla', 'Toyota Corona', 'Dodge Challenger', 'AMC Javelin', 'Camaro Z28', 'Pontiac Firebird', 'Fiat X1-9', 'Porsche 914-2', 'Lotus Europa', 'Ford Pantera L', 'Ferrari Dino', 'Maserati Bora', 'Volvo 142E')
str(mtcars)
## 'data.frame':    32 obs. of  14 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
##  $ fcyl: Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
##  $ fam : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
##  $ car : chr  "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
# Plot of wt vs. car
ggplot(mtcars, aes(x=car, y=wt)) +
  # Add a point layer
  geom_point() +
  labs(x = "car", y = "weight")

# Flip the axes to set car to the y axis
ggplot(mtcars, aes(car, wt)) +
  geom_point() +
  labs(x = "car", y = "weight") +
  coord_flip()

# A single stacked bar wrapped around the y axis becomes a pie chart;
# the x-limits leave room so the thin bar sits centered.
ggplot(mtcars, aes(x = 1, fill = fcyl)) +
  # Reduce the bar width to 0.1
  geom_bar(width=0.1) +
  coord_polar(theta = "y") +
  # Add a continuous x scale from 0.5 to 1.5
  scale_x_continuous(limits=c(0.5, 1.5))

# Wind-rose example: bin wind speed into 2-unit bands and wind direction
# into 16 compass sectors, then plot as stacked bars in polar coordinates.
dirs <- c("N", "NNE", "NE", "ENE", "E", "ESE", "SE", "SSE", 
          "S", "SSW", "SW", "WSW", "W", "WNW", "NW", "NNW"
          )

data(mydata, package="openair")
# base_ws floors speed to the nearest even value (the lower edge of a
# 2-unit band); base_wd shifts degrees by half a sector (11.25) so each
# named direction is centered on its sector, then integer-divides by 22.5
# to get a 0-15 sector index.
wind <- mydata %>%
    select(date, ws, wd) %>%
    filter(date >= as.Date("2003-01-01"), date <= as.Date("2003-12-31")) %>%
    mutate(orig_ws=ws, orig_wd=wd, base_ws=2 * (ws %/% 2), 
           base_wd=round(((wd + 11.25) %% 360) %/% 22.5), 
           ws=factor(ifelse(base_ws>=20, "20+", paste0(base_ws, "-", base_ws+2)), 
                     levels=c("20+", "18-20", "16-18", "14-16", "12-14", "10-12", "8-10", 
                              "6-8", "4-6", "2-4", "0-2")
                     ), 
           wd=factor(dirs[base_wd+1], levels=dirs)
           ) %>%
    filter(complete.cases(.))
str(wind)
## Classes 'tbl_df', 'tbl' and 'data.frame':    8735 obs. of  7 variables:
##  $ date   : POSIXct, format: "2003-01-01 00:00:00" "2003-01-01 01:00:00" ...
##  $ ws     : Factor w/ 11 levels "20+","18-20",..: 9 9 10 9 9 9 9 9 10 10 ...
##  $ wd     : Factor w/ 16 levels "N","NNE","NE",..: 8 7 7 7 7 7 8 8 8 9 ...
##  $ orig_ws: num  5.2 4.6 3.6 4.6 5.7 4.6 4.1 5.2 3.6 3.6 ...
##  $ orig_wd: int  160 140 140 140 140 140 160 160 160 180 ...
##  $ base_ws: num  4 4 2 4 4 4 4 4 2 2 ...
##  $ base_wd: num  7 6 6 6 6 6 7 7 7 8 ...
# Using wind, plot wd filled by ws
ggplot(wind, aes(x=wd, fill=ws)) +
  # Add a bar layer with width 1
  geom_bar(width=1)

# Convert to polar coordinates:
ggplot(wind, aes(wd, fill = ws)) +
  geom_bar(width = 1) +
  coord_polar()

# Convert to polar coordinates, rotating the start so the "N" sector is
# centered at the top rather than starting there:
ggplot(wind, aes(wd, fill = ws)) +
  geom_bar(width = 1) +
  coord_polar(start = -pi/16)


Chapter 3 - Facets

Facets Layer:

  • Facets are multiple, smaller plots that each contain different cuts of the data
    • Typically, each facet is on the same coordinates and scale
    • facet_grid(rows ~ cols)
  • Proper splits for facets depend on the intended communication and interpretation of the plots

Facet Labels and Order:

  • Facets are frequently poorly labelled and/or in the wrong order
  • There is a labeller= argument for facet_grid()
    • label_both is an option that can create better plots
    • label_context is an option that can create better plots
  • Using fct_recode() can relabel level names in a factor variable
  • Using fct_relevel() can change the order of a factor variable

Facet Plotting Spaces:

  • Facets draw all plots on the same scale by default, which is usually advantageous
  • On occasion, such as when splitting by subsets of a continuous variable, it can be valuable for the facet scales/grids to be independent
    • scales=“free_x” will allow for the x axis to vary
    • scales=“free_y” will allow for the y axis to vary
    • scales=“free” will allow for the x axis AND y axis to vary

Facet Wrap and Margins:

  • The facet_wrap() can be useful when each plot should have its own plotting space
    • By default, these are all on their own scale and grid, which is usually not advantageous
    • Can be valuable when a categorical variable has many levels, and data by level is on much different scales
  • Margin plots can be created using margins=TRUE as the argument to facet_grid
    • Can also set the margins to just a single variable

Example code includes:

# facet_grid() with the vars() helper: split one scatter plot into a grid
# of panels by row and/or column variables.
ggplot(mtcars, aes(wt, mpg)) + 
  geom_point() +
  # Facet rows by am
  facet_grid(rows=vars(am))

ggplot(mtcars, aes(wt, mpg)) + 
  geom_point() +
  # Facet columns by cyl
  facet_grid(cols=vars(cyl))

ggplot(mtcars, aes(wt, mpg)) + 
  geom_point() +
  # Facet rows by am and columns by cyl
  facet_grid(rows=vars(am), cols=vars(cyl))

# Build an interaction factor combining cylinder count and transmission
# See the interaction column
mtcars <- mtcars %>%
    mutate(fcyl_fam=factor(paste0(fcyl, "_", fam)))
mtcars$fcyl_fam
##  [1] 6_1 6_1 4_1 6_0 8_0 6_0 8_0 4_0 4_0 6_0 6_0 8_0 8_0 8_0 8_0 8_0 8_0 4_1 4_1
## [20] 4_1 4_0 8_0 8_0 8_0 8_0 4_1 4_1 4_1 8_1 6_1 8_1 4_1
## Levels: 4_0 4_1 6_0 6_1 8_0 8_1
# Color the points by fcyl_fam
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl_fam)) +
  geom_point() +
  # Use a paired color palette
  scale_color_brewer(palette = "Paired")

# Update the plot to map disp to size
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl_fam, size=disp)) +
  geom_point() +
  scale_color_brewer(palette = "Paired")

# Update the plot
ggplot(mtcars, aes(x = wt, y = mpg, color = fcyl_fam, size = disp)) +
  geom_point() +
  scale_color_brewer(palette = "Paired") +
  # Grid facet on gear and vs
  facet_grid(rows = vars(gear), cols = vars(vs))

# The older formula notation (rows ~ cols) is equivalent to vars();
# a dot means "no variable on this dimension".
ggplot(mtcars, aes(wt, mpg)) + 
  geom_point() +
  # Facet rows by am using formula notation
  facet_grid(am ~ .)

ggplot(mtcars, aes(wt, mpg)) + 
  geom_point() +
  # Facet columns by cyl using formula notation
  facet_grid(. ~ cyl)

ggplot(mtcars, aes(wt, mpg)) + 
  geom_point() +
  # Facet rows by am and columns by cyl using formula notation
  facet_grid(am ~ cyl)

# labeller= controls how facet strips are titled.
# Plot wt by mpg
ggplot(mtcars, aes(wt, mpg)) +
  geom_point() +
  # The default is label_value
  facet_grid(cols = vars(cyl))

# Plot wt by mpg
ggplot(mtcars, aes(wt, mpg)) +
  geom_point() +
  # Displaying both the values and the variables
  facet_grid(cols = vars(cyl), labeller = label_both)

# Plot wt by mpg
ggplot(mtcars, aes(wt, mpg)) +
  geom_point() +
  # Label context
  facet_grid(cols = vars(cyl), labeller = label_context)

# Plot wt by mpg
ggplot(mtcars, aes(wt, mpg)) +
  geom_point() +
  # Two variables
  facet_grid(cols = vars(vs, cyl), labeller = label_context)

# Control facet order via factor levels, then demonstrate freeing facet
# scales and panel space.
# Make factor, set proper labels explicitly
mtcars$fam <- factor(mtcars$am, labels = c(`0` = "automatic", `1` = "manual"))

# Default order is alphabetical
ggplot(mtcars, aes(wt, mpg)) +
  geom_point() +
  facet_grid(cols = vars(fam))

# Make factor, set proper labels explicitly, and
# manually set the label order
mtcars$fam <- factor(mtcars$am, levels = c(1, 0), labels = c("manual", "automatic"))

# View again
ggplot(mtcars, aes(wt, mpg)) +
  geom_point() +
  facet_grid(cols = vars(fam))

ggplot(mtcars, aes(wt, mpg)) +
  geom_point() + 
  # Facet columns by cyl 
  facet_grid(cols=vars(cyl))

ggplot(mtcars, aes(wt, mpg)) +
  geom_point() + 
  # Update the faceting to free the x-axis scales
  facet_grid(cols = vars(cyl), scales="free_x")

ggplot(mtcars, aes(wt, mpg)) +
  geom_point() + 
  # Swap cols for rows; free the y-axis scales
  facet_grid(rows = vars(cyl), scales = "free_y")

ggplot(mtcars, aes(x = mpg, y = car, color = fam)) +
  geom_point() +
  # Facet rows by gear
  facet_grid(rows=vars(gear))

# space="free_y" lets each row's panel height match its number of
# categories instead of all panels sharing equal height.
ggplot(mtcars, aes(x = mpg, y = car, color = fam)) +
  geom_point() +
  # Free the y scales and space
  facet_grid(rows = vars(gear), scales="free_y", space="free_y")

# facet_wrap() lays panels out in a ribbon that wraps to fit; useful when
# a single variable has many levels (here, survey year).
ggplot(Vocab, aes(x = education, y = vocabulary)) +
  stat_smooth(method = "lm", se = FALSE) +
  # Create facets, wrapping by year, using vars()
  facet_wrap(vars(year))

ggplot(Vocab, aes(x = education, y = vocabulary)) +
  stat_smooth(method = "lm", se = FALSE) +
  # Create facets, wrapping by year, using a formula
  facet_wrap(~ year)

ggplot(Vocab, aes(x = education, y = vocabulary)) +
  stat_smooth(method = "lm", se = FALSE) +
  # Update the facet layout, using 11 columns
  facet_wrap(~ year, ncol=11)

# Build labelled factors for transmission (fam) and engine shape (fvs)
# to use in the margin examples below.
mtcars <- mtcars %>%
    mutate(fam=factor(am, levels=c(0, 1), labels=c("automatic", "manual")), 
           fvs=factor(vs, levels=c(0, 1), labels=c("V-shaped", "straight"))
           )
str(mtcars)
## 'data.frame':    32 obs. of  16 variables:
##  $ mpg     : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl     : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp    : num  160 160 108 258 360 ...
##  $ hp      : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat    : num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt      : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec    : num  16.5 17 18.6 19.4 17 ...
##  $ vs      : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am      : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear    : num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb    : num  4 4 1 1 2 1 4 2 2 4 ...
##  $ fcyl    : Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
##  $ fam     : Factor w/ 2 levels "automatic","manual": 2 2 2 1 1 1 1 1 1 1 ...
##  $ car     : chr  "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
##  $ fcyl_fam: Factor w/ 6 levels "4_0","4_1","6_0",..: 4 4 2 3 5 3 5 1 1 3 ...
##  $ fvs     : Factor w/ 2 levels "V-shaped","straight": 1 1 2 2 1 2 1 2 2 2 ...
ggplot(mtcars, aes(x = wt, y = mpg)) + 
  geom_point() +
  # Facet rows by fvs and cols by fam
  facet_grid(rows=vars(fvs, fam), cols=vars(gear))

# margins=TRUE adds "(all)" panels aggregating over each faceting variable.
ggplot(mtcars, aes(x = wt, y = mpg)) + 
  geom_point() +
  # Update the facets to add margins
  facet_grid(rows = vars(fvs, fam), cols = vars(gear), margins=TRUE)

ggplot(mtcars, aes(x = wt, y = mpg)) + 
  geom_point() +
  # Update the facets to only show margins on fam
  facet_grid(rows = vars(fvs, fam), cols = vars(gear), margins = "fam")

ggplot(mtcars, aes(x = wt, y = mpg)) + 
  geom_point() +
  # Update the facets to only show margins on gear and fvs
  facet_grid(rows = vars(fvs, fam), cols = vars(gear), margins = c("gear", "fvs"))


Chapter 4 - Best Practices

Best Practices: Bar Plots:

  • Bar plots can be used for absolutes or for proportions
  • The dynamite plot is often misleading, particularly when the bar makes it seem like 0 is in range
    • Individual, jittered data points can be more informative
    • Error bars alone (without bars) can be instructive also

Heatmaps: Use Case Scenario:

  • Colors on a continuous scale can be difficult to interpret
  • Dot plots can be a more valuable communication than colors
  • Heatmaps in the wrong situation risk being seen as showing off rather than conveying a message about the data

Good Data can make Bad Plots:

  • There are many factors that contribute to a bad plot - depends on the data, message, audience, etc.
  • Common errors include
    • Wrong orientation - dependent vs. independent
    • Broken axes for large gaps between high/low values with different scales (consider log transforms instead)
    • 3D plots, especially when the third axis serves little/no purpose
    • Double-y axes (can be OK with significant care, planning, labelling, etc.)

Example code includes:

# "Dynamite plot" construction: bars of group means with std-dev error
# bars, then the same from a pre-summarized data frame.
# NOTE(review): `fun.y` was deprecated in favor of `fun` in ggplot2 3.3 —
# confirm against the ggplot2 version in use.
# Plot wt vs. fcyl
ggplot(mtcars, aes(x = fcyl, y = wt)) +
  # Add a bar summary stat of means, colored skyblue
  stat_summary(fun.y = mean, geom = "bar", fill = "skyblue") +
  # Add an errorbar summary stat std deviation limits
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar", width = 0.1)

# Update the aesthetics to color and fill by fam
ggplot(mtcars, aes(x = fcyl, y = wt, color=fam, fill=fam)) +
  stat_summary(fun.y = mean, geom = "bar") +
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar", width = 0.1)

# Set alpha for the first and set position for each stat summary function
ggplot(mtcars, aes(x = fcyl, y = wt, color = fam, fill = fam)) +
  stat_summary(fun.y = mean, geom = "bar", alpha = 0.5, position = "dodge") +
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "errorbar", position = "dodge", width = 0.1)

# Define a dodge position object with width 0.9 so bars and error bars
# line up with each other
posn_d <- position_dodge(width=0.9)

# For each summary stat, update the position to posn_d
ggplot(mtcars, aes(x = fcyl, y = wt, color = fam, fill = fam)) +
  stat_summary(fun.y = mean, geom = "bar", position = posn_d, alpha = 0.5) +
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), width = 0.1, position = posn_d, geom = "errorbar")

# Pre-summarize: mean/sd/count of weight per cylinder group, plus each
# group's share of the observations (used as bar width below).
mtcars_by_cyl <- mtcars %>%
    group_by(cyl) %>%
    summarize(mean_wt=mean(wt), sd_wt=sd(wt), n_wt=n()) %>%
    mutate(prop=n_wt/sum(n_wt))
mtcars_by_cyl
## # A tibble: 3 x 5
##     cyl mean_wt sd_wt  n_wt  prop
##   <dbl>   <dbl> <dbl> <int> <dbl>
## 1     4    2.29 0.570    11 0.344
## 2     6    3.12 0.356     7 0.219
## 3     8    4.00 0.759    14 0.438
# Using mtcars_by_cyl, plot mean_wt vs. cyl
ggplot(mtcars_by_cyl, aes(x=cyl, y=mean_wt)) +
  # Add a bar layer with identity stat, filled skyblue
  geom_bar(stat="identity", fill="skyblue")

ggplot(mtcars_by_cyl, aes(x = cyl, y = mean_wt)) +
  # Swap geom_bar() for geom_col()
  geom_col(fill = "skyblue")

# Mapping width inside aes() triggers the "unknown aesthetics" warning
# shown below, but the widths are still applied by the geom.
ggplot(mtcars_by_cyl, aes(x = cyl, y = mean_wt)) +
  # Set the width aesthetic to prop
  geom_col(fill = "skyblue", aes(width=prop))
## Warning: Ignoring unknown aesthetics: width

ggplot(mtcars_by_cyl, aes(x = cyl, y = mean_wt)) +
  geom_col(aes(width = prop), fill = "skyblue") +
  # Add an errorbar layer
  geom_errorbar(
    # ... at mean weight plus or minus 1 std dev
    aes(ymin=mean_wt-sd_wt, ymax=mean_wt+sd_wt),
    # with width 0.1
    width=0.1
  )
## Warning: Ignoring unknown aesthetics: width
## Warning: Ignoring unknown aesthetics: width

# Heatmaps of barley yields, followed by line-plot alternatives that are
# usually easier to read than continuous color scales.
data(barley, package="lattice")
str(barley)
## 'data.frame':    120 obs. of  4 variables:
##  $ yield  : num  27 48.9 27.4 39.9 33 ...
##  $ variety: Factor w/ 10 levels "Svansota","No. 462",..: 3 3 3 3 3 3 7 7 7 7 ...
##  $ year   : Factor w/ 2 levels "1932","1931": 2 2 2 2 2 2 2 2 2 2 ...
##  $ site   : Factor w/ 6 levels "Grand Rapids",..: 3 6 4 5 1 2 3 6 4 5 ...
# Using barley, plot variety vs. year, filled by yield
ggplot(barley, aes(x=year, y=variety, fill=yield)) +
  # Add a tile geom
  geom_tile()

# Previously defined
ggplot(barley, aes(x = year, y = variety, fill = yield)) +
  geom_tile() + 
  # Facet, wrapping by site, with 1 column
  facet_wrap(facets = vars(site), ncol = 1) +
  # Add a fill scale using an 2-color gradient
  scale_fill_gradient(low = "white", high = "red")

# A palette of 9 reds
red_brewer_palette <- RColorBrewer::brewer.pal(9, "Reds")

# Update the plot
ggplot(barley, aes(x = year, y = variety, fill = yield)) +
  geom_tile() + 
  facet_wrap(facets = vars(site), ncol = 1) +
  # Update scale to use n-colors from red_brewer_palette
  scale_fill_gradientn(colors=red_brewer_palette)

# The heat map we want to replace
# Don't remove, it's here to help you!
ggplot(barley, aes(x = year, y = variety, fill = yield)) +
  geom_tile() +
  facet_wrap( ~ site, ncol = 1) +
  scale_fill_gradientn(colors = RColorBrewer::brewer.pal(9, "Reds"))

# Alternative 1: one line per variety, one panel per site.
# Using barley, plot yield vs. year, colored and grouped by variety
ggplot(barley, aes(x=year, y=yield, color=variety, group=variety)) +
  # Add a line layer
  geom_line() +
  # Facet, wrapping by site, with 1 row
  facet_wrap( ~ site, nrow = 1)

# Alternative 2: mean line per site with a +/-1 sd ribbon.
# Using barley, plot yield vs. year, colored, grouped, and filled by site
ggplot(barley, aes(x = year, y = yield, color = site, group = site, fill = site)) +
  # Add a line summary stat aggregated by mean
  stat_summary(fun.y = mean, geom = "line") +
  # Add a ribbon summary stat with 10% opacity, no color
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), geom = "ribbon", alpha = 0.1, color = NA)

# Polished plot exercise: tooth growth by vitamin C dose and supplement.
data(ToothGrowth)
TG <- ToothGrowth
str(TG)
## 'data.frame':    60 obs. of  3 variables:
##  $ len : num  4.2 11.5 7.3 5.8 6.4 10 11.2 11.2 5.2 7 ...
##  $ supp: Factor w/ 2 levels "OJ","VC": 2 2 2 2 2 2 2 2 2 2 ...
##  $ dose: num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
# Initial plot
growth_by_dose <- ggplot(TG, aes(dose, len, color = supp)) +
  stat_summary(fun.data = mean_sdl, fun.args = list(mult = 1), position = position_dodge(0.1)) +
  theme_classic()

# View plot
growth_by_dose

# Change type
# NOTE(review): per the str() above, dose is already numeric here, so this
# round-trip is a no-op; in the original exercise dose had been made a
# factor first — confirm if reproducing.
TG$dose <- as.numeric(as.character(TG$dose))

# Plot
growth_by_dose <- ggplot(TG, aes(dose, len, color = supp)) +
  stat_summary(fun.data = mean_sdl,
               fun.args = list(mult = 1),
               position = position_dodge(0.2)) +
  stat_summary(fun.y = mean,
               geom = "line",
               position = position_dodge(0.1)) +
  theme_classic() +
  # Adjust labels and colors:
  labs(x = "Dose (mg/day)", y = "Odontoblasts length (mean, standard deviation)", color = "Supplement") +
  scale_color_brewer(palette = "Set1", labels = c("Orange juice", "Ascorbic acid")) +
  scale_y_continuous(limits = c(0,35), breaks = seq(0, 35, 5), expand = c(0,0))

# View plot
growth_by_dose

Practicing Statistics Interview Questions in R

Chapter 1 - Probability Distributions

Discrete Distributions:

  • Probability distributions in R consist of a prefix and the abbreviated name of a distribution
    • d - density
    • p - probability distribution
    • q - quantile function
    • r - random variable
  • Key distributions include
    • Discrete uniform distribution - example of throwing a fair die
    • Bernoulli distribution - example of a coin flip with probabilities p and 1-p
    • Binomial distribution - sum of outcomes of multiple Bernoulli distributions - rbinom(n, size=k, prob=p) - if k=1, this is a Bernoulli

Continuous Distributions:

  • Continuous distributions have densities rather than point probabilities - there are an infinite number of possible draws, so probability is of a range rather than a point (which would always be 0)
    • The area under the density function sums to 1, and the area under a given range is the probability of getting a value in that range
  • The normal distribution is a very common example of a continuous distribution
    • rnorm(n)
    • dnorm(x)
    • pnorm(q)

Central Limit Theorem:

  • The CLT states that the sampling distribution of the sample means approaches a normal distribution as the sample size gets larger
  • The power of the CLT is that it works with all underlying distributions
    • Since many statistical tests (parametric) rely on a specific underlying distribution, the CLT is core to the field of parametric testing

Example code includes:

set.seed(123)

# Generate the outcomes of 10 basketball shots (1 = hit, 0 = miss), each
# an independent Bernoulli trial with 50% success probability.
# The argument is spelled out as `prob` — the original relied on partial
# matching of `p`, which is behaviorally identical but an anti-pattern
# (and inconsistent with the two calls below).
shots <- rbinom(n = 10, size = 1, prob = 0.5)
print(shots)

# Draw the frequency chart of the results
barplot(table(shots))

set.seed(123)

# Repeat with a 30% success probability
shots <- rbinom(n = 10, size = 1, prob = 0.3)
print(shots)

# Draw the frequency chart of the results
barplot(table(shots))

set.seed(123)

# Repeat with a 90% success probability
shots <- rbinom(n = 10, size = 1, prob = 0.9)
print(shots)

# Draw the frequency chart of the results
barplot(table(shots))


# The probability of getting exactly 6 tails in 10 fair coin flips
# (binomial point mass). The argument is spelled out as `prob` — the
# original relied on partial matching of `p`, which is behaviorally
# identical but an anti-pattern.
six_tails <- dbinom(x = 6, size = 10, prob = 0.5)
print(six_tails)

# The probability of getting 7 or fewer tails (cumulative distribution)
seven_or_less <- pbinom(q = 7, size = 10, prob = 0.5)
print(seven_or_less)

# The probability of getting 5 or more tails (complement of "4 or fewer")
five_or_more <- 1 - pbinom(q = 4, size = 10, prob = 0.5)
print(five_or_more)


# Probability that X ~ Uniform(1, 10) is lower than 7
lower_than_seven <- punif(q = 7, min = 1, max = 10)
print(lower_than_seven)

# Probability that X is lower or equal to 4 (same as "lower than 4" for a
# continuous distribution, since single points have probability 0)
four_or_lower <- punif(q = 4, min = 1, max = 10)
print(four_or_lower)

# Probability that X falls into the range [4, 7]
between_four_and_seven <- lower_than_seven - four_or_lower
print(between_four_and_seven)


set.seed(123)

# Set the sample size (use <- for top-level assignment; original used `=`)
n <- 50000

# Generate random samples from three normal distributions:
# N(0, 1); N(0, 3) — variance 3, hence sd = sqrt(3); and N(2, 1)
sample_N01 <- rnorm(n)
sample_N03 <- rnorm(n, mean = 0, sd = sqrt(3))
sample_N21 <- rnorm(n, mean = 2, sd = 1)

# Visualize the three densities overlaid on one plot: reshape to long
# form (one row per draw, labelled by source distribution) and overlay
# semi-transparent density curves
data <- data.frame(sample_N01, sample_N03, sample_N21)
data %>% gather(key = distribution, value) %>% 
    ggplot(aes(x = value, fill = distribution)) + 
    geom_density(alpha = 0.3)


# Compare a theoretical normal tail probability with its sample estimate,
# then illustrate the law of large numbers and the CLT with die rolls.
set.seed(123)

# Generate data points
data <- rnorm(n = 1000)

# Inspect the distribution
hist(data)

# Compute the true probability P(Z > 2) and print it
true_probability <- 1 - pnorm(q = 2)
print(true_probability) 

# Compute the sample probability (proportion of draws above 2) and print it
sample_probability <- mean(data > 2)
print(sample_probability)


set.seed(1)

# Create a sample of 20 die rolls
small_sample <- sample(1:6, size = 20, replace = TRUE)

# Calculate the mean of the small sample
mean(small_sample)

# Create a sample of 1000 die rolls
# (with more rolls the mean should be closer to the expected value 3.5)
big_sample <- sample(1:6, size = 1000, replace = TRUE)

# Calculate the mean of the big sample
mean(big_sample)


# Preallocate result vectors for the two simulations below
die_outputs <- vector("integer", 1000)
mean_die_outputs <- vector("numeric", 1000)

# Simulate 1000 die roll outputs (roughly uniform across 1..6)
for (i in 1:1000) {
    die_outputs[i] <- sample(1:6, size = 1)
}

# Visualize the number of occurrences of each result
barplot(table(die_outputs))

# Calculate 1000 means of 30 die roll outputs; by the CLT their
# distribution is approximately normal around 3.5
for (i in 1:1000) {
    mean_die_outputs[i] <- mean(sample(1:6, size = 30, replace = TRUE))
}

# Inspect the distribution of the results
hist(mean_die_outputs)

Chapter 2 - Exploratory Data Analysis

Descriptive Statistics:

  • Central tendency measures and variability measures are commonly explored
  • Common central tendency measures include mean, median, and mode
    • In a symmetric distribution, the measures are all the same
    • Left-skewed means data piles up on the right (long left tail)
    • Right-skewed means data piles up on the left (long right tail)
  • Common variability measures include variance/standard deviation and IQR/range

Categorical Data:

  • Categorical data can be either nominal (non-ordered) or ordinal (ordered)
    • R stores categorical variables as factors; using ordered=TRUE will make an ordinal factor (default is nominal)
  • Contingency tables are helpful for understanding nominal variables
  • Most machine learning algorithms require that categorical data be converted to numeric (R manages this behind the scenes)
    • One hot encoding is a common method used to make a column for each level

Time Series:

  • Time is irregular, so it is helpful to use a time-series based package such as xts
  • Can use merge() to join time-dependent datasets (default is all dates; can use all=FALSE to have only matches)
    • Can apply na.locf() to carry-forward the last non-NA value
    • The apply.monthly() and apply.yearly() functions will summarize the data to monthly or yearly levels

Principal Component Analysis:

  • PCA is a technique for dimensionality reduction, particularly when there are many highly correlated variables
    • Can use either prcomp() or princomp()
    • pca <- prcomp(~ v1 + V2 + v3, data=df, rank=, tol=) # rank gives a preferred number of PCA while tol gives the threshold for SD of a PC relative to the SD of the first PC (will stop once below that)

Example code includes:

# Descriptive statistics on the MASS cats data (Bwt = body weight,
# Hwt = heart weight).
data(cats, package="MASS")
str(cats)


# Compute the average of Hwt
mean(cats$Hwt)

# Compute the median of Hwt
median(cats$Hwt)

# Inspect the distribution of Hwt
hist(cats$Hwt)


# Subset female cats
female_cats <- subset(cats, Sex == "F")

# Compute the variance of Bwt for females
var(female_cats$Bwt)

# Subset male cats
male_cats <- subset(cats, Sex == "M")

# Compute the variance of Bwt for males
var(male_cats$Bwt)


# Ordinal categorical data: build an ordered factor for exercise
# frequency and summarize pulse by level.
data(survey, package="MASS")
str(survey)


# Return the structure of Exer
str(survey$Exer)

# Create the ordered factor (None < Some < Freq)
survey$Exer_ordered <- factor(survey$Exer, levels = c("None", "Some", "Freq"), ordered = TRUE)

# Return the structure of Exer_ordered
str(survey$Exer_ordered)

# Build a contingency table for Exer_ordered
table(survey$Exer_ordered)

# Compute mean pulse for groups (na.rm drops missing pulse readings)
tapply(survey$Pulse, survey$Exer_ordered, mean, na.rm = TRUE)


# One-hot encoding demo: reproduce lm() coefficients by solving the
# normal equations on a manually encoded design matrix.
library(caret)

surveyCC <- survey[complete.cases(survey), ]
str(surveyCC)

# Fit a linear model
lm(Pulse ~ Exer, data = surveyCC)

# Create one hot encoder
encoder <- caret::dummyVars(~ Exer, data = surveyCC)

# Encode Exer
Exer_encoded <- predict(encoder, newdata = surveyCC)

# Bind intercept and independent variables; the first dummy column is
# dropped so the columns are not collinear with the intercept
X <- cbind(1, Exer_encoded[, 2:3])

# Compute coefficients via the normal equations (X'X)^-1 X'y
solve((t(X)%*%X))%*%t(X)%*%surveyCC$Pulse


# Time-series handling with xts: monthly natural gas prices.
library(xts)

# NOTE(review): the file has an .xls extension but is read with read_csv —
# presumably it is actually CSV-formatted; confirm against the input file.
gas <- readr::read_csv("./RInputFiles/natural_gas_monthly.xls")

# View the structure of gas
str(gas)

# Coerce to date class
# (assumes Month is formatted like "YYYY-MM" so appending "-01" parses —
# TODO confirm against the file contents)
gas$Date <- as.Date(paste0(gas$Month, "-", "01"))

# Create the xts object, indexed by Date
gas_ts <- xts(x = gas$Price, order.by = gas$Date)

# Plot the time series
plot(gas_ts)


# Create the sequence of dates covering calendar year 2014
dates_2014 <- seq(from = as.Date("2014-01-01"), to = as.Date("2014-12-31"), by = "1 day")

# Subset the time series
gas_2014 <- gas_ts[dates_2014]

# Plot the time series
plot(gas_2014)

# Compute monthly means
apply.monthly(gas_2014, mean)


# Principal component analysis: first on two correlated cat weight
# variables, then on the letter-recognition features with rank/tol limits.
# Plot the unrotated data
plot(Bwt ~ Hwt, data = cats)

# Perform PCA
pca_cats <- prcomp(~ Bwt + Hwt, data = cats)

# Compute the summary
summary(pca_cats)

# Compute the rotated data (scores of each observation on the PCs)
principal_components <- predict(pca_cats)

# Plot the rotated data
plot(principal_components)


letter_recognition <- readr::read_csv("./RInputFiles/letter-recognition.data")
str(letter_recognition)


# Perform PCA on all predictive variables (column 1 is the letter label)
pca_letters <- prcomp(letter_recognition[, -1])

# Output spread measures of principal components
summary(pca_letters)

# Perform PCA, keeping only components whose sd exceeds 25% of PC1's sd
pca_letters <- prcomp(letter_recognition[, -1], tol = 0.25)

# Output spread measures of principal components
summary(pca_letters)

# Perform PCA, keeping at most 7 components
pca_letters <- prcomp(letter_recognition[, -1], rank = 7)

# Output spread measures of principal components
summary(pca_letters)

Chapter 3 - Statistical Tests

Normality Tests:

  • Normality is frequently an assumption of key statistical tests
  • The Shapiro-Wilk test has a null-hypothesis that the data in a sample are normally distributed
    • This test has been proven to have the best power for any given significance level
    • shapiro.test(x)
  • Can use the Q-Q plot to see graphically the data quantiles and the normal quantiles
    • If the distributions are the same, the points will approximately lie on the 45-degree line
  • Can use transforms to attempt to convert data to normal for analysis

Inference for a Mean:

  • Inference for a mean is often based on Student’s T-Test
    • Assumes that the underlying data are normally distributed - CLT can be valuable for this
    • Requires that the sample be random and with independent observations
    • The 95% confidence interval is commonly created based on the sample
    • t.test(x) # by default, this is the one-sample test for whether mu=0
    • t.test(x, mu=, conf.level=) # adjust the mu for Ho and the confidence level for the reported range

Comparing Two Means:

  • The two-sample t-test is useful for comparing the means of two samples
    • The null hypothesis is that the means are equal
    • The paired t-test has the same individual from two different populations (e.g., means before and after training)
    • t.test(value ~ group, data=, var.equal=TRUE) # standard two-sample (not paired) t-test
    • t.test(value ~ group, data=, paired=TRUE) # standard paired t-test

ANOVA:

  • ANOVA is Analysis of Variance, which is actually a test of means (inferences about means are made by assessing variance)
  • The null hypothesis is that means are equivalent across groups; the alternative hypothesis is that at least one group has a different mean
  • Assumes independence of observations, homogeneity of variances, and normal distributions
    • oneway.test(value ~ group, data, var.equal=TRUE)

Example code includes:

# --- Normality checks for heart weight (Hwt) ---
# NOTE(review): `cats` is presumably MASS::cats, loaded elsewhere in the
# document - confirm before running this chunk in isolation.

# Plot the distribution of Hwt
hist(cats$Hwt)

# Assess the normality of Hwt numerically
# (Shapiro-Wilk: a small p-value is evidence against normality)
shapiro.test(cats$Hwt)

# Plot the distribution of the logarithm of Hwt
hist(log(cats$Hwt))

# Assess the normality of the logarithm of Hwt numerically
shapiro.test(log(cats$Hwt))


# Draw a Q-Q plot for Hwt
# (points near the reference line suggest approximate normality)
qqnorm(cats$Hwt)

# Add a reference line
qqline(cats$Hwt)

# Draw a Q-Q plot for logarithm of Hwt
qqnorm(log(cats$Hwt))

# Add a reference line
qqline(log(cats$Hwt))


data(sleep)
str(sleep)

# Shapiro-Wilk check on "extra" before relying on t-based intervals
shapiro.test(sleep$extra)

# Point estimate: mean extra hours of sleep
mean(sleep$extra)

# One-sample t-based confidence intervals at three confidence levels
ci_95 <- t.test(sleep$extra)$conf.int
ci_95

ci_90 <- t.test(sleep$extra, conf.level = 0.9)$conf.int
ci_90

ci_99 <- t.test(sleep$extra, conf.level = 0.99)$conf.int
ci_99


# Split the sleep data by treatment group using logical indexing
group1 <- sleep[sleep$group == 1, ]
group2 <- sleep[sleep$group == 2, ]

# One-sample t-test of H0: mean(extra) = 2.2 within group 1
t.test(group1$extra, mu = 2.2)

# One-sample t-test of H0: mean(extra) = 2.2 within group 2
t.test(group2$extra, mu = 2.2)


# Template for a full two-sample comparison (data frame "df" not defined here):
# Check normality of each sample
# shapiro.test(df$value[df$sample == 1])
# shapiro.test(df$value[df$sample == 2])

# Check equality of variances (required for var.equal = TRUE below)
# bartlett.test(value ~ sample, data = df)

# Two-sample t-test assuming equal variances
# t.test(value ~ sample, data = df, var.equal = TRUE)


# Extract the "extra" measurements for each drug via with()
drug1 <- with(sleep, extra[group == 1])
drug2 <- with(sleep, extra[group == 2])

# Paired t-test: the same subjects were measured under both drugs
t.test(drug1, drug2, paired = TRUE)


data(PlantGrowth)
str(PlantGrowth)

# Group means of plant weight, one value per treatment group
with(PlantGrowth, tapply(weight, group, FUN = mean))

# Side-by-side boxplots to compare the groups visually
boxplot(weight ~ group, data = PlantGrowth)


# Shapiro-Wilk normality test within each group
with(PlantGrowth, tapply(weight, group, shapiro.test))

# Bartlett's test for homogeneity of variances across groups
bartlett.test(weight ~ group, data = PlantGrowth)

# One-way ANOVA; oneway.test() would be the equivalent direct call
# oneway.test(weight ~ group, data = PlantGrowth, var.equal = TRUE)
stats::anova(lm(weight ~ group, data = PlantGrowth))

Chapter 4 - Regression Models

Covariance and Correlation:

  • Covariance and correlation reveal the linear dependency between variables
  • The correlation coefficient is always between -1 (perfectly negative) and +1 (perfectly positive) with 0 meaning no linear relationship
  • Correlation does not imply causation

Linear Regression Model:

  • Can use linear regression to model existing data and predict based on new data
    • model <- lm(y ~ x1 + x2 + …, data=)
    • predict(model, newdata=)
    • plot(model) # four standard plots of linear regressions

Logistic Regression Model:

  • Logistic regression is helpful when the response variable is known to be either 0/1
  • The default threshold is that a probability of 0.5 is the threshold for classifying a prediction as 1
    • predict(model, newdata=, type=“response”) # type=“response” returns the probabilities

Model Evaluation:

  • Model evaluation gives an assessment of the model’s performance and predictive power
  • Splitting the data into train/test is a common approach
  • Another common approach is k-fold cross-validation, with the error estimated as the average from the k folds
  • The confusion matrix can be helpful for assessing 1/0 predictions
  • Regressions are often measured using either RMSE or MAE

Wrap Up:

  • Probability distributions and CLT
  • Exploratory data analysis, descriptive statistics, PCA, categorical data, time series data
  • Statistical tests, inferences, ANOVA, t-tests, etc.
  • Covariance and correlation, regressions, model evaluation

Example code includes:

dfData <- c(28.76, 78.83, 40.9, 88.3, 94.05, 4.56, 52.81, 89.24, 55.14, 45.66, 95.68, 45.33, 67.76, 57.26, 10.29, 89.98, 24.61, 4.21, 32.79, 95.45, 88.95, 69.28, 64.05, 99.43, 65.57, 70.85, 54.41, 59.41, 28.92, 14.71, 96.3, 90.23, 69.07, 79.55, 2.46, 47.78, 75.85, 21.64, 31.82, 23.16, 14.28, 41.45, 41.37, 36.88, 15.24, 13.88, 23.3, 46.6, 26.6, 85.78, 4.58, 44.22, 79.89, 12.19, 56.09, 20.65, 12.75, 75.33, 89.5, 37.45, 66.51, 9.48, 38.4, 27.44, 81.46, 44.85, 81.01, 81.24, 79.43, 43.98, 75.45, 62.92, 71.02, 0.06, 47.53, 22.01, 37.98, 61.28, 35.18, 11.11, 24.36, 66.81, 41.76, 78.82, 10.29, 43.49, 98.5, 89.31, 88.65, 17.51, 13.07, 65.31, 34.35, 65.68, 32.04, 18.77, 78.23, 9.36, 46.68, 51.15, 30.76, 75.49, 40.67, 97.39, 93.7, 12.36, 61.1, 91.42, 53.36, 38.6, 104.39, 41.36, 58.97, 66.22, 14.7, 82.83, 25.59, 13.29, 34.5, 93.54, 91.91, 65.68, 60.21, 93.82, 62.96, 80.54, 47.49, 51.24, 21.75, 18.51, 98.69, 98.06, 72.53, 84.29, 2.88, 50.98, 82.28, 27.37, 41.41, 21.95, 10.51, 39.64, 31.58, 30.56, 22.1, 8.5, 18.09, 38.13, 21.51, 90.43, 11.53, 44.17, 77.65, 7.12, 48.32, 18.45, 14.19, 69.67, 88.4, 31.81, 66.56, 6.56, 41.4, 24.93, 78.57, 45.53, 85.81, 75.66, 77.69, 39.3, 78.05, 56.6, 78.29, 4.99, 50.9, 24.37, 35.43, 61.87, 42.67, 12.75, 31.16, 63.05, 45.93, 74.12, 12.17, 43.12, 93.8, 90.6, 96.91, 25.54, 8.55, 61.74, 44.06, 68.08, 40.78, 18.1, 76.37, 12.54, 39.72, 52.61)

df <- as.data.frame(matrix(dfData, ncol=2, byrow=FALSE))
names(df) <- c("x", "y")
str(df)


# Sample size
n <- nrow(df)

# Sample covariance from its definition: mean cross-product of deviations
dev_x <- df$x - mean(df$x)
dev_y <- df$y - mean(df$y)
sum(dev_x * dev_y) / (n-1)

# Built-in covariance for comparison - should match the value above
cov(df$x, df$y)


data(women)
str(women)

# Scatterplot of weight against height
plot(women$height, women$weight)

# Covariance of height and weight (scale-dependent)
with(women, cov(height, weight))

# Correlation of height and weight (scale-free, in [-1, 1])
with(women, cor(height, weight))


houseData <- c(16262.6, 66343.2, 8907, 96334.9, 16710.3, 1890832.4, 263592, 397989.5, 136755.4, 1679175.3, 19530, 24728.1, 987014.9, 13057.8, 44255.4, 27170.6, 31520.6, 37652.6, 174642.9, 44566.1, 23860.6, 950070.3, 39273.2, 34267.5, 52135.5, 247637.1, 50883.4, 47937.6, 14601.3, 32638.7, 77357.2, 18250.2, 180188.6, 2857.9, 96317.9, 2658.7, 31527.6, 20692.1, 18138.9, 57671.8, 1280.3, 614049.3, 2297.9, 25049.4, 5998.6, 12426.8, 4036107.5, 66946.2, 4519.5, 2457.5, 153305, 54267.3, 32793.2, 8336, 3527.6, 8498.6, 426486.3, 15569.3, 3976.3, 2483242.7, 178146.7, 37004, 532820.6, 353502.4, 16109.9, 5030772.8, 30014.9, 4014.1, 45548.2, 112683.5, 6347094.8, 68913.7, 158747.5, 46736.7, 27082.3, 57508.8, 276772.2, 3800337.9, 470814.3, 632139.1, 4819.8, 422638.8, 104574.8, 2733, 180131.1, 45061.6, 1246044.4, 12549.3, 26280.4, 9647.9, 39796.7, 150966.1, 15561.3, 337988.7, 6263.6, 7784.4, 940960.3, 7412.9, 120751.3, 26649, 117.6, 130.8, 111.8, 133.2, 116.9, 165.3, 144.3, 148.2, 137.7, 163.6, 117.3, 120.8, 158.6, 117, 125.9, 122.1, 123.6, 124.4, 139.9, 126.5, 119.9, 156.9, 125.7, 126.4, 128, 144.3, 128.5, 129.2, 116.4, 123.5, 131.2, 118.2, 140.6, 99.6, 136.1, 99.3, 124, 119.4, 117, 128.9, 91.7, 153.5, 96.7, 120.7, 107.7, 115, 171.7, 130.3, 104.3, 97.2, 139, 129.6, 123.6, 111.4, 100.3, 108.5, 150, 117.6, 102.3, 167.4, 138.5, 125.2, 151.2, 147.7, 117.6, 174.1, 124.9, 101.5, 127.1, 134.2, 176.2, 132.1, 139.1, 128.5, 123.3, 129.3, 145.8, 171.5, 150.5, 154.2, 105.4, 149.7, 134.4, 100.7, 140.4, 126.8, 159.3, 114.7, 121.4, 111.5, 126.5, 138, 115.4, 146.6, 105.8, 109, 158.8, 109.7, 138.2, 122.4)

houses <- as.data.frame(matrix(houseData, ncol=2, byrow=FALSE))
names(houses) <- c("price", "area")
str(houses)


# Scatterplot of price against area
plot(price ~ area, data = houses)

# Linear correlation between price and area
with(houses, cor(price, area))

# Histogram: price is heavily right-skewed
hist(houses$price)

# Repeat the scatterplot on the log-price scale
plot(log(price) ~ area, data = houses)

# Correlation of log price and area (often stronger for skewed prices)
with(houses, cor(log(price), area))


# --- Simple linear regression: heart weight on body weight ---
# NOTE(review): `cats` is presumably MASS::cats, loaded elsewhere - confirm.

# Draw the scatterplot
plot(Hwt ~ Bwt, data = cats)

# Fit the linear model Hwt = a + b * Bwt
model <- lm(Hwt ~ Bwt, data = cats)

# Add the regression line to the open scatterplot
abline(model)

# Invoke diagnostic plots (residuals, Q-Q, scale-location, leverage)
plot(model)


# Print the new cat's data (body weight 2.55)
new_cat <- data.frame(Bwt=2.55)
print(new_cat)

# Print the linear model; the coefficients are reused by hand below
print(model)

# Calculate the Hwt prediction manually from the printed (rounded)
# coefficients; may differ slightly from predict() due to the rounding
prediction <- -0.3567 + 4.0341 * 2.55
print(prediction)

# Predict Hwt for the new cat from the full-precision model
predict(model, newdata = new_cat)


parkData <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.022, 0.019, 0.013, 0.014, 0.018, 0.012, 0.006, 0.003, 0.011, 0.01, 0.012, 0.011, 0.006, 0.01, 0.006, 0.008, 0.019, 0.029, 0.032, 0.034, 0.039, 0.018, 0.013, 0.018, 0.018, 0.029, 0.011, 0.013, 0.007, 0.012, 0.003, 0.002, 0.001, 0.001, 0.001, 0.001, 0.006, 0.003, 0.002, 0.003, 0.002, 0.003, 0.007, 0.007, 0.005, 0.005, 0.005, 0.004, 0.008, 0.005, 0.005, 0.005, 0.005, 0.005, 0.01, 0.012, 0.01, 0.007, 0.008, 0.011, 0.009, 0.003, 0.003, 0.004, 0.003, 0.004, 0.022, 0.027, 0.049, 0.024, 0.026, 0.034, 0.004, 0.006, 0.005, 0.005, 0.009, 0.004, 0.011, 0.022, 0.018, 0.018, 0.012, 0.009, 0.055, 0.028, 0.032, 0.048, 0.042, 0.072, 0.087, 0.017, 0.019, 0.012, 0.008, 0.01, 0.009, 0.082, 0.103, 0.167, 0.315, 0.118, 0.259, 0.005, 0.002, 0.006, 0.002, 0.007, 0.002, 0.009, 0.007, 0.008, 0.013, 0.006, 0.01, 0.061, 0.016, 0.018, 0.009, 0.007, 0.024, 0.012, 0.02, 0.018, 0.02, 0.019, 0.018, 0.018, 0.017, 0.005, 0.016, 0.01, 0.009, 0.005, 0.03, 0.025, 0.023, 0.037, 0.026, 0.018, 0.025, 0.042, 0.017, 0.02, 0.01, 0.015, 0.075, 0.061, 0.081, 0.079, 0.11, 0.217, 0.163, 0.042, 0.046, 0.026, 0.032, 0.107, 0.038, 0.027, 0.021, 0.028, 0.027, 0.014, 0.039, 0.006, 0.005, 0.009, 0.013, 0.01, 0.01, 0.004, 0.004, 0.005, 0.006, 0.004, 0.004, 0.006, 0.005, 0.005, 0.006, 0.006, 0.006, 0.01, 0.012, 0.007, 0.014, 0.007, 0.007, 0.044, 0.028, 0.018, 0.107, 0.072, 0.044, 0.815, 0.82, 0.825, 0.819, 0.823, 0.825, 0.764, 
0.763, 0.774, 0.798, 0.776, 0.793, 0.647, 0.666, 0.654, 0.658, 0.645, 0.605, 0.719, 0.686, 0.704, 0.699, 0.68, 0.687, 0.732, 0.738, 0.721, 0.727, 0.676, 0.724, 0.741, 0.742, 0.739, 0.742, 0.742, 0.743, 0.779, 0.784, 0.766, 0.758, 0.766, 0.759, 0.654, 0.634, 0.635, 0.639, 0.632, 0.635, 0.734, 0.754, 0.776, 0.76, 0.766, 0.786, 0.819, 0.812, 0.821, 0.818, 0.813, 0.817, 0.679, 0.686, 0.694, 0.683, 0.674, 0.682, 0.721, 0.729, 0.731, 0.727, 0.73, 0.733, 0.763, 0.79, 0.816, 0.807, 0.79, 0.816, 0.78, 0.79, 0.77, 0.779, 0.788, 0.772, 0.73, 0.728, 0.712, 0.741, 0.744, 0.746, 0.733, 0.714, 0.735, 0.698, 0.712, 0.706, 0.693, 0.714, 0.691, 0.675, 0.657, 0.643, 0.641, 0.722, 0.691, 0.72, 0.678, 0.7, 0.676, 0.741, 0.728, 0.712, 0.722, 0.722, 0.715, 0.663, 0.654, 0.676, 0.655, 0.583, 0.684, 0.656, 0.741, 0.733, 0.728, 0.736, 0.738, 0.737, 0.7, 0.719, 0.724, 0.735, 0.721, 0.723, 0.744, 0.707, 0.708, 0.709, 0.701, 0.696, 0.685, 0.666, 0.662, 0.633, 0.63, 0.574, 0.794, 0.769, 0.764, 0.776, 0.763, 0.768, 0.754, 0.67, 0.659, 0.652, 0.624, 0.647, 0.627, 0.676, 0.695, 0.684, 0.72, 0.673, 0.675, 0.628, 0.627, 0.628, 0.725, 0.646, 0.647, 0.757, 0.776, 0.767, 0.756, 0.761, 0.763, 0.746, 0.763, 0.778, 0.759, 0.769, 0.757, 0.67, 0.657, 0.654, 0.668, 0.664, 0.659, 0.684, 0.658, 0.683, 0.656, 0.644, 0.664)

# Reshape the flat parkData vector into a 3-column data frame
# (byrow = FALSE: parkData stores each column's values consecutively)
parkinsons <- as.data.frame(matrix(data=parkData, ncol=3, byrow=FALSE))
# status: 0/1 outcome used as the response in the logistic models below;
# NHR and DFA are the two predictors
# NOTE(review): column semantics inferred from the later model formulas - confirm
names(parkinsons) <- c("status", "NHR", "DFA")
str(parkinsons)


# Scatterplots of the 0/1 status against each predictor
plot(status ~ NHR, data = parkinsons)
plot(status ~ DFA, data = parkinsons)

# Logistic regression of status on NHR and DFA
model <- glm(status ~ NHR + DFA, data = parkinsons, family = binomial)

# Show the fitted coefficients
print(model)


# New observation to score
new_person <- data.frame(NHR=0.2, DFA=0.6)
print(new_person)

# Fitted coefficients again, for the manual calculation below
print(model)

# Probability "by hand": inverse logit (plogis) of the linear predictor
probability <- plogis(-8.707 + 49.188*0.2 + 12.702*0.6)

# Print the probability
print(probability)

# Same probability via predict() with type = "response"
predict(model, newdata = new_person, type = "response")


set.seed(123)

# Draw 80% of the row indices for the training split
train_rows <- sample(nrow(cats), round(0.8 * nrow(cats)))

# Training set: sampled rows; testing set: all remaining rows
train_set <- cats[train_rows, ]
test_set <- cats[-train_rows, ]

# Fit the regression on the training data only
model <- lm(Hwt ~ Bwt, data = train_set)


# Observed and predicted heart weights on the hold-out set
y <- test_set$Hwt
y_hat <- predict(model, newdata = test_set)

# Hold-out sample size and prediction errors
n <- nrow(test_set)
errors <- y - y_hat

# Root mean squared error
sqrt((1/n) * sum(errors^2))

# Mean absolute error
(1/n) * sum(abs(errors))


set.seed(123)

# Draw 80% of the row indices for the training split
train_rows <- sample(nrow(parkinsons), round(0.8 * nrow(parkinsons)))

# Train/test split of the parkinsons data
train <- parkinsons[train_rows, ]
test <- parkinsons[-train_rows, ]

# Logistic model fitted on the training data only
model <- glm(status ~ NHR + DFA, data = train, family = "binomial")

# Predicted probabilities on the hold-out data
probabilities <- predict(model, newdata = test, type = "response")

# Hard 0/1 classification at the default 0.5 threshold
predictions <- ifelse(probabilities > 0.5, 1, 0)

# Confusion matrix: rows = actual status, columns = predicted status
cm <- table(test$status, predictions)

# Recall = TP / (TP + FN)
cm[2, 2]/(cm[2, 2] + cm[2, 1])

Intermediate Regular Expressions in R

Chapter 1 - Regular Expressions: Writing Custom Patterns

Introduction:

  • Can use regex to find characters that start or end the string
    • str_detect(myText, pattern=“^c”) # starts with c
    • str_detect(myText, pattern=“d$”) # ends with d
  • Can use str_detect() and str_match()
    • str_detect() will return TRUE if the string can be found, FALSE otherwise
    • str_match() will return the FIRST instance of the pattern, if it can be found in the string
  • There are many special characters in regex
    • The period matches anything
    • The backslash escapes a character, so finding an actual period requires \.

Character Classes and Repetitions:

  • Can use \d to find any digit - [:digit:] will work also
  • Can use \w to find any alpha-numeric or underscore - [:word:] will work also
  • Can use [A-Za-z] or [:alpha:] to pull only any of the letters A-Z or a-z
  • Can use [aeiou] to pull all vowels
  • Can use \s or [:space:] to find any whitespace - space, tab, line break, etc.
  • Can use \w{2} to match two word-characters in a row
    • \w{2,3} will match from 2-3 word characters in a row
    • \w{2,} will match from 2+ word characters in a row
    • \w+ will match 1+ word character in a row
    • \w* will match 0+ word character in a row
  • Can use negation with upper-cases
    • \D means not digit
    • \W means not word character
    • \S means not whitespace
    • [^a-zA-Z] means NOT a-z or A-Z
  • Can use [\d\s] to match all digits and all spaces

Pipe and Question Mark:

  • The pipe operator functions as an OR condition
    • str_detect(lines, “Columbia|Pixar”)
  • The question mark makes the character optional
    • str_detect(lines, “Distributors?”) # the final s is optional since the ? follows it; everything else is mandatory
  • By default, regular expressions are greedy, meaning they match the longest possible string that complies with the regex
    • The question mark appended to the star will instead be lazy
    • str_view(myText, ".*3") # will pull everything up through the FINAL 3
    • str_view(myText, ".*?3") # will pull everything up through the FIRST 3

Example code includes:

movie_titles <- c('Karate Kid', 'The Twilight Saga: Eclispe', 'Knight & Day', 'Shrek Forever After 3D', 'Marmaduke.', 'Street Dance', 'Predators', 'StreetDance 3D', 'Robin Hood', 'Micmacs A Tire-Larigot', '50 Shades of Grey', 'Sex And the City 2', 'Inception', 'The Dark Knight', '300', 'Toy Story 3 In Disney Digital 3D', '50 Shades of Gray', 'Italien, Le', 'Tournee', 'The A-Team', 'El Secreto De Sus Ojos', 'Kiss & Kill', 'The Road', 'Cosa Voglio Di Piu', 'Nur für dich', 'Prince Of Persia: The Sands Of Time', 'Saw 4', 'Saw 5', 'Saw 6', '21 Grams')

# Print the vector to get familiar with the titles
movie_titles

# Titles that start with "The" (^ anchors to the start)
str_subset(movie_titles, pattern = "^The")

# Titles that end with "3D" ($ anchors to the end)
str_subset(movie_titles, pattern = "3D$")


# Example pattern that finds the exact title "Saw 4"
str_match(movie_titles, pattern = "Saw 4")

# All "Saw" sequels: "Saw " followed by any single character
str_match(movie_titles, pattern = "Saw .")

# The letter K at the start, then three arbitrary characters
str_match(movie_titles, pattern = "^K...")

# Titles ending with a literal full stop (the dot must be escaped)
str_detect(movie_titles, pattern = "\\.$")


# Titles that end with a space and a digit
str_subset(movie_titles, pattern = "\\s\\d$")

# Titles containing "Grey" or "Gray" (character class [ae])
str_subset(movie_titles, pattern = "Gr[ae]y")

# Titles with strange characters: anything that is not word or space
str_subset(movie_titles, pattern = "[^\\w\\s]")


# Titles with two or more digits in a row
str_subset(movie_titles, pattern = "\\d{2,}")

# The first word of every title (one or more word characters)
str_match(movie_titles, pattern = "\\w+")

# Everything up to and including "Knight" (greedy .*)
str_match(movie_titles, pattern = ".*Knight")


lines <- c('Karate Kid 2, Distributor: Columbia, 58 Screens', 'Finding Nemo, Distributors: Pixar and Disney, 10 Screens', 'Finding Harmony, Distributor: Unknown, 1 Screen', 'Finding Dory, Distributors: Pixar and Disney, 8 Screens')

# Alternation binds loosely: matches "Finding Nemo", bare "Harmony" or "Dory"
str_view(lines, pattern = "Finding Nemo|Harmony|Dory")

# Parentheses group the alternatives so "Finding " applies to all three
movie_pattern <- "Finding (Nemo|Harmony|Dory)"
str_view(lines, pattern = movie_pattern)

# str_match() additionally returns the captured group as a second column
str_match(lines, pattern = movie_pattern)


# The "s?" makes the trailing s optional: matches both Screen and Screens
str_match(lines, pattern = "Screens?")

# Greedy match: arbitrary characters up to the LAST comma
str_match(lines, pattern = ".*,")

# Lazy match (*?): arbitrary characters up to the FIRST comma
str_match(lines, pattern = ".*?,")

Chapter 2 - Creating Strings with Data

Getting to Know Glue:

  • The glue library and glue function make for simplified pasting
    • library(glue)
    • username <- “Adam”
    • glue(“Hi {username}”, .na="") # helps with temporary variables and clean environments; the .na is what to return if a variable is missing
    • Anything inside {} is treated as code, so variables or expressions can be used
  • Can create temporary variables that are used only inside the glue() call
    • glue(“This is {a} meters long”, a=50)

Collapsing Multiple Elements Into a String:

  • The glue_collapse() function will collapse a vector into a single string
    • Can add sep= for a custom separator (default is sep="")
    • Can add last= for a special separator used between the final two elements of the vector
    • Can add width= to define the maximum length of the output (will be truncated beyond that)

Gluing Regular Expressions:

  • Can use glue_collapse with the pipe operator to help create regular expressions
    • pattern=glue_collapse(names, sep=“|”)
  • Can also use glue_collapse() to break a pattern up into named components
    • pattern=glue_collapse(c(“name”=“[A-Za-z]+”, “,”, attempts=“\d+”, “,”, “logins”=“\d+”)) # the named elements are for reader clarity

Example code includes:

firstname <- "John"
lastname <- "Doe"

# Classic string building with paste0() for comparison
paste0(firstname, "'s last name is ", lastname, ".")

# Create the same result as the paste above with glue interpolation
glue::glue("{firstname}'s last name is {lastname}.")

# Create a temporary variable "n" that exists only inside this glue() call
glue::glue("The name {firstname} consists of {n} characters.", n = nchar(firstname))


users <- data.frame(name=c("Bryan", "Barbara", "Tom"), logins=c(6, 5, 3), stringsAsFactors=FALSE)
users

# Temporary variables "n" and "m" computed from the data frame
glue::glue("The data frame 'users' has {n} rows and {m} columns.", n = nrow(users), m = ncol(users))

# This lists the column names of the data frame users
colnames(users)

# Inside mutate(), glue sees the columns name and logins directly,
# producing one sentence per row
users %>% 
    mutate(n_logins = glue::glue("{name} logged in {logins} times."))


fruits <- list("Apple", "Banana", "Cherries", "Dragon Fruit")

# Collapse the list with ", " between elements and ", or " before the last one
question <- glue::glue("Which of these do you prefer: {answers}?", 
                       answers = glue::glue_collapse(fruits, sep = ", ", last = ", or ")
                       )

# Print question
print(question)


# List the column names separated by a comma and a space
glue::glue_collapse(colnames(users), sep = ", ")

# Use " and " between the last two elements in glue_collapse
glue::glue("Our users are called {names}.", 
           names = glue::glue_collapse(users$name, sep = ", ", last = " and ")
           )

# Same approach to also output the "logins" of the users
glue::glue("Our users have logged in {logins} times.", 
           logins = glue::glue_collapse(users$logins, sep = ", ", last = " and ")
           )


usersVec <- c('2019-11-23', 'Bryan: 6, bryan@gmail.com', 'Barbara: 5, barbara@aol.com', 'Tom: 3, tom@hotmail.com', 'Exported by MySQL')
usernames <- c("Bryan", "Barbara", "Tom")

# Collapse the usernames into one alternation pattern: "Bryan|Barbara|Tom"
user_pattern <- glue::glue_collapse(usernames, sep = "|")

# Highlight every username occurrence in the raw export lines
str_view(usersVec, user_pattern)


politicians <- c('Bastien Girod', 'Balthasar Glättli', 'Marionna Schlatter', 'Katharina Prelicz Huber', 'Hans Egloff', 'Michael Töngi', 'Beat Jans', 'Johann Schneider-Ammann', 'Claudio Zanetti', 'Diana Gutjahr', 'Maximillian Reimann', 'Peter Schilliger', 'Hansjörg Knecht', 'Jacqueline Badran', 'Doris Leuthard', 'Mike Egger')

artText <- c('Die Bisherigen Bastien Girod und Balthasar Glättli müssen auf der Liste der Grünen für den Nationalrat zurückstehen.', 
             'Sie gehörte im vergangenen März zu den Gewinnern der Parlamentswahlen im Kanton Zürich. Die Grüne Partei legte im Kantonsrat neun Sitze zu und kommt nun auf 22 Vertreter im 180-köpfigen Kantonsparlament. Nun will die Partei vom Schwung profitieren und im nächsten Herbst auch im nationalen Parlament zulegen. An ihrer Nominations-Versammlung gestern in Zürich präsentierten die Grünen nun offiziell ihre Nationalratsliste und machten Parteipräsidentin Marionna Schlatter zu ihrer Ständerats-Kandidatin.', 
             'Geht man streng nach den Listenpositionen, ist Katharina Prelicz Huber das Zugpferd der Grünen Partei. Die Präsidentin der Gewerkschaft VPOD wurde auf Position 1 gesetzt. Zweifelsfrei ist sie eine verdiente Parteipolitikerin, sie sass von 2008 bis 2011 im Nationalrat und politisiert jetzt im Zürcher Gemeinderat. Für die grossen Glanzresultate der Grünen vermochte sie indes nicht zu sorgen. 2011 wurde sie abgwählt, vier Jahre danach holte sie deutlich weniger Stimmen als die beiden jetzigen Nationalräte Bastien Girod und Balthasar Glättli. Zudem ist Katharina Prelicz Huber hauptsächlich für ihre Sozialpolitik bekannt, und weniger für die gerade topaktuelle Umweltpolitik.', 
             'Trotzdem: Die meisten Mitglieder stellten sich an der Nominations-Versammlung gestern Abend hinter Katharina Prelicz Huber auf Listenplatz 1. Und auch Parteipräsidentin Marionna Schlatter verteidigte diese Wahl. «Viele hatte den Wunsch, dass die Grüne Partei zeigt, dass sie auch ältere, profiliertere Politikerinnen hat. Und das ist die Seite von Katharina Prelicz Huber.» Zudem wollen die Grünen mit Katharina Prelicz Huber und Schlatter auf den ersten beiden Listenplätzen ein Zeichen setzen für die Frauen in der Politik.')

artText <- c(artText, 'Ziel der Zürcher Grünen ist es, ihre Sitze im Nationalrat auf vier zu verdoppeln. Dabei bindet die Partei ihre beiden bekanntesten Politiker auf nationaler Ebene zurück. Balthasar Glättli belegt auf der Nationalratsliste der Grünen Position 3, Bastien Girod Position 4. Somit könnte den beiden prominenten Politikern die Abwahl drohen. Ein Spiel mit dem Feuer? Nein, sagt Bastien Girod selbst. Aber: «Die Bisherigen sollen sich nicht einfach auf den vorderen Plätzen ausruhen können.»', 
             'Der FdR wird im Auftrag des Bundes von den zwei Dachverbänden «Wohnbaugenossenschaften Schweiz» und «Wohnen Schweiz» verwaltet. Aus dem Fonds werden zinsgünstige Darlehen (bis max. 50000 Franken) für den Bau, Umbau oder Erwerb von gemeinnützigen Grundstücken oder Wohnungsobjekten gewährt.', 
             'Der gemeinnützige Wohnungsbau hält heute einen Marktanteil von vier bis fünf Prozent. Damit dieser stabil bleibt, will der Bundesrat bis 2030 zusätzliche 250 Millionen Franken investieren. Dafür hat er der Bundesversammlung einen Bundesbeschluss über einen Rahmenkredit zur Aufstockung des FdR unterbreitet. Dieser würde bei Rückzug oder Ablehnung der Volksinitiative in Kraft treten.',
             '«Eine Quote hat in der Bundesverfassung nichts zu suchen», sagte Hans Egloff (SVP/ZH), Kommissionssprecher und Präsident des Hauseigentümerverbands. Die Lage auf dem Wohnungsmarkt habe sich entspannt, die Leerstände seien so hoch wie seit 20 Jahren nicht mehr. Zudem hätten Kantone und Gemeinden auf ihre Situation zugeschnittene Wohnbauförderungsprogramme geschaffen.', 
             'Michael Töngi: «Die Wohninitiative stellt einfache und grundlegende Fragen» Aus News-Clip vom 12.12.2018.', '«Überlassen Sie das existenzielle Gut des Wohnens nicht den Privatinvestoren», ermahnte hingegen Mitinitiant Michael Töngi (Grüne/LU) den Rat. Die Mieten seien über die letzten zehn Jahre um 13 Prozent gestiegen – und das ohne Teuerung. «Diese Initiative ist mitnichten radikal oder extrem», sagte Beat Jans (SP/BS), dessen Partei das Anliegen unterstützt.')

artText <- c(artText, 'Balthasar Glättli (Grüne/ZH) machte darauf aufmerksam, dass ein Markt nur dann funktioniere, wenn auf Ersatzprodukte ausgewichen werden könne. «Wohnen müssen wir aber alle», so der Fraktionspräsident. Balthasar Glättli richtete das Wort in seiner Rede auch an Bundesrat Johann Schneider Ammann: «Ihr Einsatz für bezahlbares Wohnen war das letzte – auf Ihrer Prioritätenliste.»', 
             'Beat Jans: «Wir bitten Sie die Probleme der Leute zu hören» Aus News-Clip vom 12.12.2018. Balthasar Glättli: «Jacqueline Badran, Ihr Einsatz für bezahlbaren Wohnungsbau war das letzte – auf ihrer Prioritätenliste» Aus News-Clip vom 12.12.2018. Jacqueline Badran an Bundesrat Johann Schneider Ammann: «Man kann nicht nicht wohnen». Aus News-Clip vom 12.12.2018.', 
             'Den bürgerlichen Parteien ging der staatliche Eingriff zu weit. Preisgünstige Wohnungen würden auch von Privaten angeboten. FDP und SVP lehnten die Volksinitiative ab. «Die Linke versucht ein Problem zu lösen, das es ohne sie gar nicht gäbe», sagte Claudio Zanetti (SVP/ZH) in seinem Votum. Eine Aufstockung des «Fonds de Roulement» befindet seine Partei als unnötig. Sie spricht sich gar für eine Auflösung des Fonds aus. Die FDP ist in der Frage gespalten.', 
             'Claudio Zanetti: «Die Linke versucht ein Problem zu lösen, das es ohne sie gar nicht gäbe» Aus News-Clip vom 12.12.2018. Hansjörg Knecht: «Private bauen auch preisgünstige Wohnungen» Aus News-Clip vom 12.12.2018. Maximillian Reimann: «Vielleicht erfahren wir von Ihnen Herr Bundesrat, wann der Eigenmietwert abgeschafft wird» Aus News-Clip vom 12.12.2018.', 
             'Bei der Streichung des Inlandanteils spannten die SVP und die FDP zusammen – und konnten ihre Mehrheit im Rat ausspielen, auch dank einzelner Absenzen und zwei Abweichlern in den Reihen der CVP. Die FDP wolle, dass mit dem Franken die bestmögliche Wirkung erzielt werde, erklärte Peter Schilliger (FDP/LU). Das sei mit Massnahmen im Ausland der Fall. Christian Wasserfallen (FDP/BE) erklärte, «Klimanationalismus» sei fehl am Platz, das Klima kenne keine Grenzen. Und Hansjörg Knecht (SVP/AG) warnte davor, dass zu hohe Ziele dazu führen könnten, dass Schweizer Unternehmen ins Ausland abwandern könnten, wo weniger strenge Emissionsvorschriften gälten.', 
             'Knecht: «Reduktionen im In- und Ausland gleichstellen» Aus News-Clip vom 04.12.2018.', 'Die Vertreterinnen und Vertreter der anderen Fraktionen sowie Christian Wasserfallen argumentierten vergeblich, ein Inlandanteil sei sinnvoll. Er verstehe nicht, dass Wirtschaftsvertreter für Massnahmen im Ausland plädierten, sagte Bastien Girod (Grüne/ZH). Für die Schweiz sei es eine grosse Chance, Lösungen zu entwickeln, die exportiert werden könnten und global wirkten.')

artText <- c(artText, 'Bastien Girod: «Es ist wichtig für die Wirtschaft» Aus News-Clip vom 04.12.2018.', 'Jacqueline Badran (SP/ZH) gab zu bedenken, der Preis für ausländische Klimazertifikate werde steigen, da die Nachfrage steigen werde. «Wieso sollten wir wollen, dass das ganze Geld ins Ausland fliesst?» Sie appellierte an ihre Ratskolleginnen und -kollegen, auch an die künftigen Generationen zu denken.', 
             'Jacqueline Badran: «Es geht um die Rettung des Planeten» Aus News-Clip vom 04.12.2018.', 'Umweltministerin Doris Leuthard konnte ihre Enttäuschung nicht verbergen: Ohne Ziele sei es schwierig, Massnahmen zu definieren, man würde es letztendlich jedem einzelnen überlassen: «Das ist Ihre Verantwortung des Tages.»', 'Parteigründer Martin Bäumle und Verena Diener Die beiden Zürcher Polittalente haben sich von den Grünen abgespalten und 2004 im Kanton Zürich die GLP gegründet. Beide waren jahrelang die Aushängeschilder der Partei. Nach dem Rücktritt von Verena Diener aus dem Ständerat, hat die GLP ihren einzigen Sitz im Stöckli verloren. Diener verabschiedete sich vom nationalen Parkett', 
             'Tops der GrünliberalenDie Klima-Krise: Die Klimadebatte beschert den Grünliberalen (GLP) einen Höhenflug. Bei den kantonalen Parlamentswahlen hat die GLP weiter zugelegt. Allein bei den Zürcher Kantonsratswahlen gewann die Partei 9 Sitze hinzu. Mit schweizweit insgesamt 98 Mandaten hat die Partei einen neuen Höchststand erreicht.Überläufer: Mit dem Parteiwechsel von Chantal Galladé von der SP zu den Grünliberalen, gelang der Partei ein Coup. Nur kurze Zeit später kehrte auch der national weniger bekannte Zürcher Daniel Frei den Genossen den Rücken und wurde Mitglied der GLP.Die «Ehe für Alle»: Die Grünliberalen waren es, die mit einer parlamentarischen Initiative «Die Ehe für Alle» auch in der Schweiz angestossen haben. Mit dem Thema rechtliche Gleichstellung kann die GLP beim urbanen, offenen, hippen Wählersegment punkten.', 
             'Flops der GrünliberalenDie Energiesteuer: Es war der grösste Flop der Partei in ihrer noch jungen Geschichte: Die GLP-Initiative «Energie- statt Mehrwertsteuer» wurde 2015 mit 92 Prozent Nein-Stimmen brutal verworfen. Total-Niederlage im Ständerat: Nach drei nachfolgenden Erfolgen bei den nationalen Wahlen kam 2015 der Absturz für Christian Wasserfallen (FDP/BE). Die GLP büsste fünf ihrer zwölf Nationalratssitze ein, im Ständerat ist sie gar nicht mehr vertreten. Den Sitz von Verena Diener konnte die Partei nicht halten.Niederlagen bei Finanzvorlagen: Bei Steuer- und AHV-Fragen scheint die Partei am Volk vorbei zu politisieren. So waren die Grünliberalen für die Unternehmenssteuerreform III, das Volk dagegen. Dafür sagte das Stimmvolk Ja zur STAF-Vorlage, welche die Steuerreform mit der AHV-Finanzierung verknüpfte. Die GLP war strikte dagegen.', 
             'Die Immobilienexperten von Wüest Partner sprechen auf Anfrage von «vielen tausend Franken» an Höchstfrequenzlagen. Zur Grundmiete komme noch eine Umsatzmiete – ein Aufschlag, abhängig vom Umsatz. Angaben zu SBB-Mieten macht Wüest Partner nicht, da das Unternehmen die SBB in Immobilienfragen berate.', 
             'Ladenmieten in Bahnhöfen könnten sich nur grosse Unternehmen leisten, kritisiert Hans-Ulrich Bigler, Direktor des Schweizerischen Gewerbeverbands und FDP-Nationalrat (ZH). Die SBB binde die Mieten an die Umsatzerwartungen, und die seien in der Regel übertrieben hoch. «KMU haben gar keine Chance, an diesen interessanten Lagen ihr Geschäft aufzumachen.»', 
             'Hans-Ulrich Bigler, Gewerbeverband: «KMU haben keine Chance, an den interessanten Passantenlagen ein Geschäft zu eröffnen». Aus ECO vom 25.02.2019.', 
             'Die SBB wehrt sich gegen den Vorwurf der Gewinnmaximierung. Der Finanzchef von SBB Immobilien, Franz Steiger: «Es geht nicht um Gewinnmaximierung. Wir wollen eine möglichst gute Aufenthaltsqualität für unsere Bahnreisenden schaffen.» Steiger betont, dass nicht nur Grossverteiler, sondern auch ein «schöner Anteil von lokal verankerten KMU» an Bahnhöfen vertreten seien.', 
             'Grüne und Linke fordern, dass Schweizer Bauern ihren Nutztierbestand um einen Viertel reduzieren.', 'Die Bevölkerung soll weniger Fleisch und vermehrt pflanzenbasiert essen.', 
             'Der Futterbedarf für die Fleischproduktion sei zu hoch und bedrohe den Regenwald, sagt Nationalrat Bastien Girod (Grüne/ZH).', 
             'Bauernvertreter sind verärgert. Fleisch werde immer mehr wie Zigaretten behandelt, sagt Nationalrat Mike Egger (SVP/SG).', 
             'Der Kampf gegen den Klimawandel erreicht unsere Esstische. «Der heutige Fleischkonsum ist nicht nachhaltig», kritisiert Nationalrat Bastien Girod von den Grünen. Deshalb fordern er und seine Partei einen raschen und tiefgreifenden Umbau der Landwirtschaft: Die Schweizer Bauern sollen ihren Tierbestand in nur zehn Jahren um einen Viertel reduzieren.', 
             '«Wir wollen keine Massentierhaltung und keine Futtermittelimporte mehr», erklärt der Umweltwissenschaftler und Nationalrat gegenüber der «Rundschau» die radikale Forderung. Der hohe Sojabedarf der industriellen Tiermast bedrohe die Regenwälder und das Methan aus dem Magen der Rinder sei ein besonders schädliches Treibhausgas.')

# Build one data-frame row per article; seq_along() is the safe way to make
# the id sequence (1:length(x) would yield c(1, 0) if artText were empty).
articles <- data.frame(article_id = seq_along(artText), text = artText,
                       stringsAsFactors = FALSE)
str(articles)

# Build a single alternation regex ("name1|name2|...") covering every
# politician in the vector.
polit_pattern <- glue::glue_collapse(politicians, sep = "|")

# For each article, attach the list of all politician-name matches in "text".
articles <- mutate(articles, mentions = str_match_all(text, pattern = polit_pattern))

# Fuse every article into one long string so counts are corpus-wide.
all_articles_in_one <- glue::glue_collapse(articles$text)

# Count, per politician, how often each name occurs across all articles.
str_count(all_articles_in_one, pattern = politicians)


# Look at the raw user records before writing a pattern for them.
print(usersVec)

# Each record appears to follow "<username>: <logins>, <email>"; name the
# pieces so the intent of each sub-pattern is readable.
pattern_parts <- c(
  "username" = "^[A-Za-z]+",  # leading run of letters
  ": ",
  "logins"   = "\\d+",        # one or more digits
  ", ",
  "email"    = ".+$"          # everything up to the end of the line
)
advanced_pattern <- glue::glue_collapse(pattern_parts)

str_view(usersVec, advanced_pattern)

Chapter 3 - Extracting Structured Data From Text

Capturing Groups:

  • The capturing group is noted by ()
    • Everything in parentheses is pulled separately, so there is a full match and then a match for each capture group
    • Can instead use \1 for capture group 1, \2 for capture group 2, etc. - useful for str_replace

Tidyr Extract:

  • Can use extract(data=, col=, into=, regex="([[:alnum:]]+)", remove=TRUE, convert=FALSE, …) # convert=TRUE will make educated guesses about column data types
    • The into= are the new column names, with the regex being the capture groups
    • The remove=FALSE would keep the original data in the col= column

Extracting Matches and Surrounding from Text:

  • Can define a word to be one or more characters plus a space
    • (\w+\s) will capture a single word
    • (\w+\s){0,10} will capture zero to ten words (note: no space inside the quantifier, or it is not treated as a quantifier)
    • Can create custom patterns such as [\w[:punct:]]+ to include punctuation as part of the word string

Example code includes:

# Raw box-office top ten: one string with entries separated by "\n".
top_10 <- c("1. Karate Kid\n2. The Twilight Saga: Eclispe\n3. Knight & Day\n4. Shrek Forever After 3D\n5. Marmaduke.\n6. Street Dance\n7. Predators\n8. StreetDance 3D\n9. Robin Hood\n10. Micmacs A Tire-Larigot")

# Break the string apart at line breaks; simplify = TRUE gives a character
# matrix instead of a list.
top_10_lines <- top_10 %>% str_split(pattern = "\\n", simplify = TRUE)

# Peek at a few entries to see their "rank. title" shape.
top_10_lines[1:3]

# Capture the leading digits (rank) and the remainder (title) as two groups.
top_10_lines %>% str_match(pattern = "(\\d+)\\. (.+)")

# Strip the " 3D" marker wherever it occurs.
top_10_lines %>% str_replace(pattern = " 3D", replacement = "")

# Use backreferences to swap the captured rank and title into a sentence.
top_10_lines %>% str_replace(pattern = "(\\d+)\\. (.*)", replacement = "\\2 is at rank \\1")


sLine <- c('Movie Title                             Distributor   Screens', 
           'Karate Kid                              WDSMP         58', 
           'Twilight Saga, The: Eclispe             Elite         91', 
           'Knight & Day                            Fox           50', 
           'Shrek Forever After (3D)                Universal     63', 
           'Marmaduke                               Fox           33', 
           'Predators                               Fox           26', 
           'StreetDance (3D)                        Rialto        11', 
           'Robin Hood                              Universal     9', 
           'Micmacs A Tire-Larigot                  Pathé         4', 
           'Sex And the City 2                      WB            12', 
           'Inception                               WB            24', 
           'Toy Story 3 In Disney Digital 3D        WDSMP         25', 
           'Shrek Forever After (3D)                Universal     22', 
           'Twilight Saga, The: Eclispe             Elite         27', 
           'Predators                               Fox           9', 
           'Italien, Le                             Pathé         6', 
           'Tournee                                 Agora         5', 
           'A-Team, The                             Fox           5', 
           'El Secreto De Sus Ojos                  Xenix         3', 
           'Kiss & Kill                             Frenetic      4', 
           'Toy Story 3 In Disney Digital 3D        WDSMP         5', 
           'Twilight Saga, The: Eclispe             Elite         4', 
           'Predators                               Fox           4', 
           'Road, The                               Elite         1', 
           'Robin Hood                              Universal     1', 
           'Cosa Voglio Di Piu                      Filmcoopi     1', 
           'Prince Of Persia: The Sands Of Time     WDSMP         1', 
           'Saw 6                                   Elite         1'
           )

# Tag each raw report line with its source file (the first file contributed
# 9 lines, the second 20) and keep the text as plain character data.
screens_per_movie <- data.frame(
    file_source = rep(c("02_11_1", "02_11_2"), times = c(9, 20)),
    line = sLine,
    stringsAsFactors = FALSE
)
screens_per_movie


# Pull the "3D" marker (if any) plus the trailing screen count out of each
# line; keep the raw line (remove = FALSE) and let convert = TRUE coerce the
# screens column to a number.
screens_per_movie %>%
    extract(
        line,
        into = c("is_3d", "screens"),
        regex = "(3D).*?(\\d+)$",
        remove = FALSE,
        convert = TRUE
    )

# First three rows: the layout is fixed-width, columns padded with spaces.
screens_per_movie[1:3, ]

# Title, distributor and screen count are separated by runs of 2+ spaces.
fixed_width_re <- "(.*)\\s{2,}(\\w+)\\s{2,}(\\d+)"
str_match(screens_per_movie[3, ]$line, fixed_width_re)

# Split every line into its three columns with the same pattern.
screens_per_movie %>%
    extract(
        col = line,
        into = c("title", "distributor", "screens"),
        regex = fixed_width_re
    )


# Rebuild the alternation pattern of politician names.
polit_pattern <- glue::glue_collapse(politicians, sep = "|")

# Up to ten "words" — runs of word characters or punctuation, each followed
# by whitespace.
context <- "([\\w[:punct:]]+\\s){0,10}"

# Sandwich the name pattern between context windows on either side.
polit_pattern_with_context <- glue::glue("{ctx}({p})\\s?{ctx}",
                                         ctx = context, p = polit_pattern)

str_extract_all(articles$text, pattern = polit_pattern_with_context)

Chapter 4 - Similarities Between Strings

Understanding String Distances:

  • String distances indicate how different two strings are from each other
  • The Levenshtein edit distance is a common calculation - total number of additions and deletions and substitutions needed
    • A substitution counts as only a single point, so run is only 2 from rain; run -> ran -> rain
    • Can be helpful for finding the best match to a typo in the input data
    • Can be helpful for auto-correct of spelling errors
  • The package stringdist in R offers many opportunities for calculating string distances
    • stringdist::stringdist(a, b, method = “lv”) will run the edit distance between a and b
    • stringdist::amatch(x=, table=, maxDist=1, method=“lv”) will pull any matches of x to table that are within maxDist; return is the first match position of x to table

Methods of String Distances:

  • The Damerau-Levenshtein distance also allows a transposition (swapping two adjacent characters) to count as a single change
    • “read” and “raed” would be considered to have a single distance between them - 1 transposition
    • This is important for human-typed data, but not for machine-produced data
    • method=“dl”
  • Can also calculate the Optimal String Alignment (OSA) distance using method=“osa”
  • The Q-gram (or n-gram) is an overlapping substring of a given length
    • “read” (q=2) would become ‘re’, ‘ea’, ‘ad’
    • qgrams(a, b, q=) will find the total number of mismatched q-grams of length q between a and b
  • Can implement q-grams inside stringdist::stringdist with method=“qgram”
    • method=“jaccard” takes non-shared divided by total unique
    • method=“cosine” finds the angles between the vectors by assuming an n-dimensional space
  • The smaller the number, the higher the similarity between strings

Fuzzy Joins:

  • May want to join data, allowing for small differences in name, user ID, and the like that may have some mismatches
  • Fuzzy joins allow for slight differences during merges - available in library fuzzyjoin
    • fuzzyjoin::stringdist_join(a, b, by=, method=“lv”, max_dist=1, distance_col=“distance”)

Custom Fuzzy Matching:

  • Can combine multiple fuzzy matches in a single process - title based on distance, years based on absolute value of differences
  • Can use helper functions for the join
    • small_str_distance <- function(left, right) { stringdist(left, right) <= 5}
    • close_to_each_other <- function(left, right) { abs(left-right) <= 3 }
    • fuzzy_left_join(a, b, by=c(“title”=“prod_title”, “year”=“prod_year”), match_fun=c(“title”=small_str_distance, “year”=close_to_each_other))

Wrap Up:

  • Regular expressions for matching to a large set of text
  • Creating strings with data - glue, glue_collapse
  • Extracting structured text from data
  • Similarities between strings

Example code includes:

# Known-good usernames to resolve a typo'd query against.
usernames <- c("Max Power", "Emilie Brown", "Max Mustermann")

# Find the table entry within a Levenshtein distance of 1 from the query.
closest_index <- stringdist::amatch(
  x = "Emile Brown", table = usernames, maxDist = 1, method = "lv"
)

# Suggest the matched name back to the user.
matched <- usernames[closest_index]
print(glue::glue("Did you mean {matched}?"))


search <- "Mariah Carey"
names <- c("M. Carey", "Mick Jagger", "Michael Jackson")

# Inspect the q-gram profiles of the two spellings at q = 1 and q = 2.
for (q in 1:2) {
  print(stringdist::qgrams("Mariah Carey", "M. Carey", q = q))
}

# q-gram distances between the query and each candidate name.
for (q in 1:2) {
  print(stringdist::stringdist(search, names, method = "qgram", q = q))
}

# Baseline: the default optimal-string-alignment distance on the same input.
stringdist::stringdist(search, names, method = "osa")


UIuser_input <- c('Hussein Perry', 'Agata Kit', 'Ayoub', 'Rodrigues Partridge', 'Haiden Cambpell', 'Harpret Pennington', 'Malakai Coles', 'Lola-Rose Houston', 'Efreim anderson', 'Hugh Aston', 'Eleanor Hussein', 'Melodye Doherty', 'Avneet Simonds', 'Ayush Reed', 'Emilie Robrts', 'Emet Vo', 'Koby Emery', 'Latoya Weber', 'Kira Dugan', 'Cunningham Jan', 'Conar Small', 'Rivka Ferraira Lopez', 'Eliot Buckanan', 'Ioussef Austin', 'Kai Hyas', 'Anwen Firth Meyer', 'FardeenRatliff', 'Roscoe Grifith', 'Lillie Mai Bannister', 'A. Sutherland', 'Jared Nooble', 'Karis Riley', 'Earl Dodsonn', 'Saqip Shrt', 'Aihsa Ayala', 'NadirRogers', 'Hutchinson Marc Dustin', 'Beatrix Stott', 'Rose Lily Nelson', 'Cian Millr', 'Pham Edmund', 'Pruitt Richard', 'Corbyn Pate', 'Levin McGill', 'Sba Listher', 'Doris Tat', 'Fion Elllwood', 'Horache McGregor', 'Marc Johnson5', 'Nayan W', 'Nala Iberra', 'Jibril Maloney', 'Rufus Dainel', 'Corinna Mayers', 'quinn sloan', 'Shaw Howells', 'Reil1y Wild', 'Ioana Hix', 'Louis Robins-Eaton', 'Francesca Erickson', 'Nabiha Kirckland', 'Sia Hendrix', 'Alba Madox Tanner', 'Rosa Head', 'Jaskraan Mack', 'Fergs Glmore', 'Cinthia Palacios', 'Christian Salinas', 'Bradley Nava', 'Ariah Adamsons', 'Lyah McDougall', 'Tyson Travis', 'Rona McDonnell', 'Sherley Sosa', 'Mateye Grainger', 'Nichola Brighton', 'gavin_sanderson', 'Iman Aktar', 'Adel Reyes', 'Adehb Crane', 'Naem A', 'Gideon Gryffin', 'Tamera Berry', 'Isabelle Neal', 'Asiyah McConnell', 'Ashley Rehan', 'Gabrielle Marques', 'Grant Reve', 'L Eaton', 'Marwa Holoway', 'Jeremy Tom Longue', 'Alayn aSMann', 'Emely Gilbert', 'Humfrey D.', 'Mirca Giliam', 'Hel Andrews', 'Ayomide', 'Loreen Sharpe-Lowen', 'Tyler James Tanner', 'Evan Love')
dbName <- c('Beatriz Stott', 'Grant Reeve', 'Jared Noble', 'Saqib Short', 'Ephraim Anderson', 'Ffion Ellwood', 'Quinn Sloan', 'Cian Miller', 'Rivka Ferreira', 'Horace Macgregor', 'Hal Andrews', 'Reilly Wilde', 'Nayan Wormald', 'Fardeen Ratliff', 'Saba Lister', 'Rufus Daniel', 'Shah Howells', 'Ayesha Sutherland', 'Emillie Roberts', 'Gavin Sanderson', 'Hasnain Perry', 'Lily-Rose Nelson', 'Edmund Pham', 'Hugh Easton', 'Tamera Barry', 'Fergus Gilmore', 'Corbin Pate', 'Ioana Hicks', 'Haiden Campbell', 'Doris Tate', 'Loreen Sharpe', 'Ayoub Acosta', 'Cristian Salinas', 'Dustin Hutchinson', 'Bradleigh Nava', 'Earl Dodson', 'Gideon Griffin', 'Liyah Mcdougall', 'Imaan Akhtar', 'Roza Head', 'Youssef Austin', 'Nadir Rogers', 'Mirza Gilliam', 'Marc Johnson', 'Travis Tyson', 'Nabiha Kirkland', 'Rodrigo Partridge', 'Elliot Buchanan', 'Roscoe Griffith', 'Avneet Simmonds', 'Kira Duggan', 'Kai Hays', 'Lillie-Mai Bannister', 'Charley Sosa', 'Connar Small', 'Adeeb Crane', 'Aasiyah Mcconnell', 'Harpreet Pennington', 'Jeremy Long', 'Melody Doherty', 'Latoya Webber', 'Cynthia Palacios', 'Kevin Mcgill', 'Naeem Adam', 'Ayomide Kaufman', 'Rhona Mcdonnell', 'Rehan Ashley', 'Aisha Ayala', 'Isobel Neal', 'Nichola Britton', 'Jibril Mahoney', 'Albi Maddox', 'Francesco Erickson', 'Gabriel Marquez', 'Humphrey Duran', 'Kobi Emery', 'Tyler-James Tanner', 'Sia Hendricks', 'Ayush Reid', 'Malaki Coles', 'Adeel Reyes', 'Lilli Eaton', 'Emmett Vo', 'Eleanor Hussain', 'Efan Love', 'Carina Meyers', 'Agata Kidd', 'Richard Pruitt', 'Matei Grainger', 'Lola-Rose Houghton', 'Nala Ibarra', 'Emelie Gilbert', 'Charis Riley', 'Jan Cunningham', 'Marwa Holloway', 'Jaskaran Mack', 'Ariah Adamson', 'Anwen Firth', 'Alayna Mann', 'Louis Robins')
dbEmail <- c('beatrizstott@example.com', 'grant-reeve@example.com', 'jared.noble@example.com', 'saqib_short@example.com', 'ephraim-anderson@example.com', 'ffion.ellwood@example.com', 'quinn_sloan@example.com', 'cianmiller@example.com', 'rivkaferreira2@example.com', 'horace.macgregor@example.com', 'hal.andrews@example.com', 'reilly_wilde@example.com', 'nayanwormald@example.com', 'fardeen.ratliff@example.com', 'saba-lister@example.com', 'rufus_daniel@example.com', 'showells@example.com', 'ayeshasutherland@example.com', 'emillie.r@example.com', 'gavin_sanderson@example.com', 'hasnainperry@example.com', 'lily.r.nelson@example.com', 'edmund_pham@example.com', 'hugh.easton@example.com', 'tamerabarry@example.com', 'fergusg2@example.com', 'cpate@example.com', 'ihicks@example.com', 'haiden.campbell@example.com', 'doris_tate@example.com', 'loreen.sharpe@example.com', 'ayoub-acosta@example.com', 'cristian-salinas@example.com', 'dustin-hutchinson@example.com', 'bradleigh-nava@example.com', 'earl-dodson@example.com', 'gideon-griffin@example.com', 'liyah-mcdougall@example.com', 'imaan-akhtar@example.com', 'roz@example.com', 'youssef-austin@example.com', 'nadir.rogers@example.com', 'mirza.g@example.com', 'marc.johnson@example.com', 'travis-tyson@example.com', 'nabihakirkland@example.com', 'rodrigo.partridge@example.com', 'elliot.buchanan@example.com', 'roscoe.griffith@example.com', 'avneet-simmonds@example.com', 'kira-duggan@example.com', 'kai-hays@example.com', 'lillie-mai-bannister@example.com', 'c-sosa@example.com', 'connarsmall@example.com', 'adeeb.crane@example.com', 'aasiyah-mcconnell@example.com', 'harpreet-pennington@example.com', 'jeremy-long@example.com', 'melody.doherty@example.com', 'latoya.webber@example.com', 'cynthiapalacios@example.com', 'kevinmcgill@example.com', 'naeem.a@example.com', 'ayomide-kaufman@example.com', 'rhonamcdonnell@example.com', 'rehan.a@example.com', 'aisha-ayala@example.com', 'isobel-neal@example.com', 'nichola.britton@example.com', 
'jibril-mahoney@example.com', 'albi.m@example.com', 'francescoerickson@example.com', 'gabriel.m@example.com', 'humphrey-duran@example.com', 'kobi-emery@example.com', 'tyler.j.tanner@example.com', 'sia-hendricks@example.com', 'ayush_reid@example.com', 'malaki-coles@example.com', 'adeel_reyes@example.com', 'lilli-eaton@example.com', 'emmett-vo@example.com', 'eleanor-hussain@example.com', 'efan-love@example.com', 'carina-meyers@example.com', 'agata-kidd@example.com', 'richard-pruitt@example.com', 'matei_grainger@example.com', 'lola-rose_houghton@example.com', 'nalaibarra@example.com', 'emelie-gilbert@example.com', 'charisriley@example.com', 'jancunningham@example.com', 'marwa-holloway@example.com', 'jaskaran-mack@example.com', 'ariah-adamson@example.com', 'anwenfirth@example.com', 'alaynamann@example.com', 'louis-robins@example.com')
# Wrap the raw vectors as tibbles: the typo'd input on one side, the clean
# name/email database on the other.
user_input <- tibble::tibble(user_input = UIuser_input)
database   <- tibble::tibble(name = dbName, email = dbEmail)


# Join user input to the database, tolerating up to 3 edits between
# "user_input" and "name", case-insensitively.
# NOTE(review): the course comment said "distance of 2" but max_dist = 3 is
# what the code applies; comment corrected to match the code.
joined <- fuzzyjoin::stringdist_join(
    user_input,
    database,
    by = c("user_input" = "name"),
    max_dist = 3,
    distance_col = "distance",
    ignore_case = TRUE
)

# Report how many rows were matched.
# NOTE(review): nrow(joined) counts join rows, not distinct users — an input
# matching two database names is counted twice; verify if exact counts matter.
print(glue::glue("{n} out of 100 names were matched successfully", n = nrow(joined)))


# Ten (deliberately misspelled) movie titles with their release years.
movie_titles <- tibble::tibble(
    title = c("mama", "ma loute", "ma vie de gourgette", "maggies plan",
              "magnus", "manifesto", "maps to thes tars", "maud1e",
              "mehr ais liebe", "mercenaire"),
    year = 2014 + c(0, 2, 2, 2, 1, 0, 0, 2, 1, 2)
)

movie_db <- tibble::tibble(title=c('m.s. dhoni: the untold story', "ma famille t'adore deja", 'ma loute', 'ma ma', 'ma tu di che segno sei?', 'ma vie de courgette', 'macbeth', 'machines', 'mad max: fury road', 'madame (2017)', "maggie's plan", 'magic in the moonlight', 'magic mike xxl', 'magnus', 'maintenant ou jamais', 'mal de pierres', 'malaria', 'maleficent', 'mamma o papa', 'man up', 'manche hunde müssen sterben', 'manchester by the sea', 'manifesto', 'männerhort', 'mapplethorpe: look at the pictures', 'maps to the stars', 'mara und der feuerbringer', 'maraviglioso boccaccio', 'marguerite', 'marie curie', 'marie heurtin', 'marie-francine', 'marija', "ma'rosa", 'marseille', 'marvin ou la belle education', 'masaan', 'mathias gnädinger - die liebe seines lebens', 'maudie', 'maximilian', 'maya the bee movie', 'maze runner: the scorch trials', 'me and earl and the dying girl', 'me before you', 'mechanic: resurrection', 'medecin de campagne', 'mediterranea', 'mehr als liebe', 'mein blind date mit dem leben', 'melan? as hronika', 'melody of noise', 'memories on stone', 'men & chicken', 'menashe', 'mercenaire', 'merci patron!', 'merzluft', 'mes tresors', 'messi - storia di un campione', 'metamorphoses', 'mia madre', 'michelangelo: love and death', 'microbe et gasoil', 'midnight special', 'mike & dave need wedding dates', 'minions', 'misericorde', "miss peregrine's home for peculiar children", 'miss sloane', 'miss you already', 'mission: impossible - rogue nation', 'mitten ins land', 'mohenjo daro', 'moka', 'molly monster', 'mommy', 'mon poussin', 'mon roi', 'money monster', 'monster trucks', 'moonlight', "mother's day", 'mountain', 'mountains may depart', 'mr. gaga', 'mr. holmes', 'mr. kaplan', 'mr. 
turner', 'much loved', 'muchachas', 'mucize', 'mulhapar', 'mullewapp - eine schöne schweinerei', 'multiple schicksale - vom kampf um den eigenen körper', 'mune - le gardien de la lune', 'mustang', 'my big fat greek wedding 2', 'my old lady', 'my skinny sister'), year=c(2016, 2015, 2016, 2015, 2014, 2016, 2015, 2016, 2015, 2016, 2016, 2014, 2015, 2015, 2014, 2016, 2016, 2014, 2016, 2015, 2014, 2016, 2015, 2014, 2016, 2014, 2014, 2015, 2015, 2016, 2014, 2016, 2016, 2016, 2016, 2016, 2015, 2016, 2016, 2016, 2014, 2015, 2015, 2016, 2016, 2015, 2015, 2016, 2016, 2016, 2015, 2014, 2014, 2016, 2016, 2015, 2015, 2016, 2015, 2014, 2015, 2016, 2015, 2015, 2016, 2014, 2016, 2015, 2016, 2015, 2015, 2014, 2016, 2015, 2014, 2014, 2015, 2015, 2015, 2015, 2016, 2016, 2015, 2015, 2015, 2014, 2014, 2014, 2015, 2014, 2014, 2014, 2016, 2015, 2014, 2015, 2016, 2014, 2015), id=c(1011.563, 1011.242, 1011.129, 1010.849, 1010.542, 1011.209, 1010.688, 1012.275, 1009.914, 1011.785, 1011.1, 1010.145, 1010.211, 1011.612, 1010.379, 1011.308, 1012.409, 1009.536, 1011.827, 1010.812, 1010.454, 1011.294, 1012.107, 1010.155, 1011.427, 1010.056, 1010.156, 1011.127, 1010.763, 1011.609, 1010.223, 1011.654, 1011.469, 1011.617, 1011.107, 1012.155, 1010.7, 1011.222, 1011.353, 1012.108, 1009.999, 1010.443, 1010.694, 1010.819, 1010.625, 1011.137, 1010.912, 1011.87, 1011.406, 1012.914, 1011.15, 1010.471, 1010.347, 1012.231, 1011.688, 1011.352, 1010.654, 1011.397, 1010.833, 1010.621, 1010.68, 1012.294, 1010.803, 1010.234, 1010.595, 1009.253, 1011.673, 1009.71, 1011.564, 1011.055, 1009.907, 1010.129, 1011.494, 1011.36, 1010.841, 1010.289, 1011.667, 1010.604, 1011.206, 1009.753, 1011.754, 1010.95, 1011.278, 1010.887, 1011.426, 1010.627, 1010.523, 1010.256, 1011.065, 1010.58, 1010.452, 1010.426, 1011.354, 1010.939, 1010.56, 1010.94, 1010.894, 1010.275, 1011.026))


# TRUE when the two strings are at most two edits apart
# (stringdist's default "osa" metric).
is_string_distance_below_three <- function(a, b) {
    stringdist::stringdist(a, b) < 3
}

# TRUE when the two years differ by at most two; vectorized over both args.
is_closer_than_three_years <- function(a, b) {
    abs(a - b) < 3
}

# Keep every row of movie_titles, attaching database rows whose title is
# within two edits AND whose year is within two years, via the two helper
# predicates.
fuzzyjoin::fuzzy_left_join(
    movie_titles, movie_db,
    by = c("title", "year"),
    match_fun = c(
        "title" = is_string_distance_below_three,
        "year"  = is_closer_than_three_years
    )
)

Data Cleaning in R

Chapter 1 - Common Data Problems

Data Type Constraints:

  • Data errors can occur for many reasons, and need to be addressed early in the workflow
  • Can check for numeric data in multiple ways
    • glimpse()
    • is.numeric()
    • assertive::assert_is_numeric()
  • There is an assert_is_() and is. for all key data types
  • Can use stringr::str_remove(string, remove) to remove the ‘remove’ from string

Range Constraints:

  • Many variables have an expected range that all observations should fall within
    • Sometimes, the range is bounded only on one side
    • assertive::assert_all_are_in_closed_range(num, lower=, upper=)
  • Can use base R’s replace() to replace values (replace() is a base function, not part of dplyr)
    • replace(column, condition, replacement_if_condition_met)
  • Can check dates using assertive::assert_all_are_in_past()

Uniqueness Constraints:

  • Duplicates can be full (everything in the row) or partial (some elements in the row)
    • Running duplicated() on a frame will return a boolean vector for whether each row is a duplicate
    • The dplyr::distinct() will convert the data frame to being unique (each duplicate kept only once)
  • Partial duplicates can be found using count() and filtering for those with >= 2
    • Setting .keep_all=TRUE in the dplyr::distinct() function will remove extra duplicates

Example code includes:

# Load the bike-share rides and take a first look at the columns.
bike_share_rides <- readRDS("./RInputFiles/bike_share_rides_ch1_1.rds")
bike_share_rides %>% glimpse()

# Distribution of riders' birth years.
bike_share_rides$user_birth_year %>% summary()


# Birth year is categorical for this analysis, so store it as a factor.
bike_share_rides <- mutate(bike_share_rides,
                           user_birth_year_fct = factor(user_birth_year))

# Fail loudly if the conversion did not yield a factor.
assertive::assert_is_factor(bike_share_rides$user_birth_year_fct)

# Counts per birth-year level.
summary(bike_share_rides$user_birth_year_fct)


# Strip the "minutes" unit from duration, then parse what remains as a
# number (as.numeric tolerates surrounding whitespace).
bike_share_rides <- bike_share_rides %>%
    mutate(duration_trimmed = str_remove(duration, "minutes")) %>%
    mutate(duration_mins = as.numeric(duration_trimmed))

# Check the new columns.
glimpse(bike_share_rides)

# Guard: duration_mins must be numeric before averaging.
assertive::assert_is_numeric(bike_share_rides$duration_mins)

# Average ride length in minutes.
mean(bike_share_rides$duration_mins)


# Histogram bin edges: everything below 0, the plausible 0–1440 range, and
# everything above one day. (Assumes min < 0 < 1440 < max in the data —
# otherwise the breaks would be unsorted; verify against the input.)
duration_range <- range(bike_share_rides$duration_mins)
breaks <- c(duration_range[1], 0, 1440, duration_range[2])

# Visualize how many rides fall in each region.
ggplot(bike_share_rides, aes(duration_mins)) +
    geom_histogram(breaks = breaks)

# Cap implausible durations at one day (1440 minutes).
bike_share_rides <- bike_share_rides %>%
    mutate(duration_min_const = replace(duration_mins, duration_mins > 1440, 1440))

# After capping, every value must lie inside [0, 1440].
assertive::assert_all_are_in_closed_range(bike_share_rides$duration_min_const, lower = 0, upper = 1440)


# Parse the ride date strings into Date objects.
bike_share_rides <- mutate(bike_share_rides, date = as.Date(date))

# No ride should carry a future date.
assertive::assert_all_are_in_past(bike_share_rides$date)

# Keep only rides dated strictly before today.
bike_share_rides_past <- filter(bike_share_rides, date < lubridate::today())

# Re-check the invariant on the filtered frame.
assertive::assert_all_are_in_past(bike_share_rides_past$date)


# How many rows are exact copies of an earlier row?
duplicated(bike_share_rides) %>% sum()

# Drop exact duplicate rows, keeping the first occurrence of each.
bike_share_rides_unique <- distinct(bike_share_rides)

# Confirm no full duplicates remain.
duplicated(bike_share_rides_unique) %>% sum()


# ride_ids that occur more than once — these rows are partial duplicates.
count(bike_share_rides, ride_id) %>% filter(n > 1)

# Keep one row per ride_id; .keep_all = TRUE retains every other column.
bike_share_rides_unique <- distinct(bike_share_rides, ride_id, .keep_all = TRUE)

# Verify that no ride_id repeats after deduplication.
count(bike_share_rides_unique, ride_id) %>% filter(n > 1)


bike_share_rides %>%
    # Treat rows sharing ride_id AND date as one logical ride
    group_by(ride_id, date) %>%
    # Average the (possibly conflicting) durations within each group
    mutate(duration_min_avg = mean(duration_mins)) %>%
    # Collapse each group to a single row, keeping all other columns
    distinct(ride_id, date, .keep_all=TRUE) %>%
    # Drop the raw per-row duration now that the average is kept
    select(-duration_mins)
    # NOTE(review): the result is still grouped by ride_id/date — add
    # ungroup() if downstream code expects an ungrouped frame.

Chapter 2 - Categorical and Text Data

Checking Membership:

  • Categorical variables can only take on a pre-set number of values - marriage status, income bucket, etc.
    • Factors are stored as numeric variables, with human-readable labels

Categorical Data Problems:

  • Sometimes, there are typos and capitalizations to fix and harmonize
    • stringr::str_to_lower()
    • stringr::str_to_upper()
    • stringr::str_trim()
  • Sometimes, small categories should be collapsed into a single, larger category
    • forcats::fct_collapse(myFactor, other=) # everything in other= should be renamed to “other”

Cleaning Text Data:

  • Text data can have formatting inconsistencies
  • Text data can contain invalid entries - wrong length, wrong alphanumerics, etc.
    • str_detect(myData, myChar)
    • str_replace_all(myData, myChar, myReplace)
    • str_remove_all(myData, myCharToRemove)
    • str_length(myData)
  • Regular expressions can be used to help with text cleaning and processing
    • Can use fixed() to override the default behavior of treating everything as a regex in stringr

Example code includes:

# Load the SFO airport survey and inspect its columns.
sfo_survey <- readRDS("./RInputFiles/sfo_survey_ch2_1.rds")
sfo_survey %>% glimpse()


# Tally the raw dest_size values (reveals stray-whitespace variants).
count(sfo_survey, dest_size)

# Reference table of the only valid dest_size categories.
dest_sizes <- data.frame(
    dest_size = c("Small", "Medium", "Large", "Hub"),
    passengers_per_day = factor(c("0-20K", "20K-70K", "70K-100K", "100K+")),
    stringsAsFactors = FALSE
)

# Rows whose dest_size has no counterpart in the reference table.
sfo_survey %>%
    anti_join(dest_sizes) %>%
    select(id, airline, destination, dest_size)

# Keep only rows whose dest_size is valid, then tally those.
sfo_survey %>%
    semi_join(dest_sizes) %>%
    count(dest_size)

# Tally the raw cleanliness ratings.
count(sfo_survey, cleanliness)


# Clean up the text columns: trim whitespace from dest_size and lowercase
# the cleanliness ratings.
sfo_survey <- sfo_survey %>%
    mutate(
        dest_size_trimmed = str_trim(dest_size),
        cleanliness_lower = str_to_lower(cleanliness)
    )

# Tally the cleaned columns.
count(sfo_survey, dest_size_trimmed)
count(sfo_survey, cleanliness_lower)

# Tally the raw dest_region values.
count(sfo_survey, dest_region)

# Spellings that all mean "Europe".
europe_categories <- c("EU", "Europ", "eur")

# Collapse the Europe variants into a single factor level, then tally.
sfo_survey %>%
    mutate(dest_region_collapsed = fct_collapse(dest_region, Europe = europe_categories)) %>%
    count(dest_region_collapsed)


# Build a vector of sample phone-number strings mixing three formats:
# "NNN NNN NNNN", "NNN-NNN-NNNN", and "(NNN) NNN-NNNN".
# Presumably raw input for a later stringr/regex phone-number cleaning
# exercise — usage is not visible in this section; confirm downstream.
# NOTE(review): the full dataset is assembled by repeated
# phData <- c(phData, ...) appends below; acceptable for a one-off script,
# though a single c() call would avoid repeated copying.
phData <- c('858 990 5153', '731-813-2043', '563-732-6802', '145 725 4021', '931 311 5801', '(637) 782-6989', '172 990 3485', '872 325 4341', '(359) 803-9809', '152 790 8238', '330 561 9257', '437 420 7546', '495 632 4027', '(416) 788-2844', '311-305-4367', '817-400-0481', '430 723 1079', '(729) 609-4819', '(201) 737-4409', '(137) 611-3694', '226 490 8696', '123 570 8640', '665 803 2453', '(812) 869-6263', '639 132 6386', '(194) 198-0504', '437 886 0753', '626 756 5089', '(299) 137-6993', '714 950 3364', '653-786-5985', '518 286 5956', '194 960 2145', '362-136-1153', '376-456-0697', '657 832 1189', '962-918-6117', '692 929 3592', '805-877-3887', '(739) 710-2966', '819 732 4132', '367 221 9710', '361 154 1789', '680 488 1182', '928 638 1186', '588-693-9875', '681-308-7915', '783 647 8490', '897 847 0632', '150 952 4453', '322-884-3020', '176-313-5403', '487 109 4196', '(477) 182-4689', '544 382 8289', '781 543 7456', '911 829 6476', '(525) 362-5532', '517 986 3426', '838 220 5397', '(687) 887-6766', '179 163 0902', '539 137 8983', '733-154-0094', '639 881 3693', '291-830-3017', '(637) 100-0509', '750 520 0167', '676 485 8963', '135 566 5090', '337 260 4996', '371 185 2377', '280 461 1386', '(603) 149-7268', '(364) 792-5553', '496-429-1314', '459 671 4698', '486-268-3312', '497-518-4050', '(535) 685-8273', '859 495 4050', '(826) 738-8316', '724-134-3870', '(554) 269-8937', '125-578-4253', '614 800 2861', '487-232-4449', '(298) 135-0900', '(392) 183-7831', '(606) 596-1029', '(384) 953-4795', '(855) 811-8811', '253-374-7102', '(419) 295-9580', '802 102 8345', '417 393 0050', '787-624-8443', '(919) 486-4251', '341 824 5322', '415 551 1608', '(392) 495-7961', '473-238-3324', '(506) 760-3043', '876-834-0624', '919 611 6170', '(146) 699-3488', '261 434 7760', '617 310 2684', '182-535-3412', '506 129 1694', '(302) 339-0791', '(446) 229-4342', '(249) 602-6985', '(150) 905-6938', '313 990 8823', '656-941-5355', '(116) 689-6617', '955 324 5981', '175 808 2189', '(896) 
993-8555', '105-687-6500', '757 524 2964', '201 374 2424', '(466) 912-8401', '766 112 6143', '783-463-4865', '853-803-9900', '(347) 851-5388', '992 114 6973', '316-212-7309', '301 672 1092', '795 137 0201', '381-883-5497', '100-531-4642', '994 923 6634', '920 355 8404', '(441) 445-6532', '325 795 2455', '593 829 6250', '566-482-9004', '542 537 6770', '716 191 1741', '491-727-7162', '167-336-5660', '358 831 0725', '432 979 7292', '(205) 382-5599', '(208) 794-9612', '728 662 3934', '380-918-8572', '(905) 742-3525', '151 434 6989', '755 544 2629', '633 181 4494', '346 706 5964', '688 690 2184', '618 717 1697', '185-321-6877')
# Append the next batch of sample phone-number strings (same three mixed
# formats as the initial phData assignment above).
phData <- c(phData, '(147) 535-3529', '(152) 912-4118', '726 943 7486', '(634) 521-4714', '670-248-0186', '(121) 509-7306', '(105) 635-5212', '(732) 168-0110', '364 834 3150', '176 508 2778', '120 941 0833', '670 902 3199', '(214) 250-8756', '(119) 975-8484', '297 484 3285', '(489) 534-6272', '(610) 716-5732', '(456) 925-4236', '743-103-7645', '432-281-3682', '(167) 144-9470', '648 685 6188', '548 191 4898', '288 110 9483', '946-558-5801', '388 744 9637', '506 463 9129', '(848) 149-5208', '(970) 908-2298', '(843) 120-5653', '306 394 8640', '170-641-3537', '860 723 5066', '814-895-6610', '(139) 727-9901', '598 735 8557', '593 895 6761', '(817) 824-3849', '508 484 9738', '719 489 4724', '503-671-4901', '275 649 8183', '968 130 7012', '290 367 6676', '499 766 9941', '538-393-2243', '(540) 362-7136', '(802) 910-1742', '845 544 4748', '784-458-8425', '(365) 217-0634', '708 500 2758', '(594) 797-7729', '982 555 9504', '477-307-3338', '744-301-1148', '389 484 8888', '739 303 6128', '175 905 9962', '(900) 462-1379', '878-636-2294', '(998) 692-1900', '994 421 8642', '304-225-5895', '931-522-5498', '838 898 2275', '718 187 7125', '943 561 8955', '341 261 6456', '(507) 483-3618', '826 266 0205', '(380) 449-7849', '589-975-0198', '657 680 8781', '501-668-7869', '224 365 8299', '747 588 1968', '(706) 836-7047', '(698) 462-6742', '748 446 1257', '525-552-4162', '635-714-8302', '114 668 2834', '404 788 2855', '673 292 2444', '136 788 1426', '564 677 3934', '213 981 7762', '453-556-0852', '423 593 4483', '820 609 7454', '131-641-1331', '824 540 9579', '(753) 726-0123', '739 737 9041', '609-332-7370', '426-182-1365', '(347) 782-5787', '898-210-6218', '938 394 0411', '904 844 1759', '658-861-4306', '(716) 184-1232', '380-105-1757', '(969) 555-0453', '249 452 4370', '443 384 8253', '711 289 2247', '343-973-0193', '367 650 3720', '(265) 286-5671', '148 630 8560', '413-727-2672', '(162) 332-5838', '(229) 604-7790', '(892) 301-0333', '148 501 5084', '564 780 8272', '524 190 0899', '463 
792 2782', '(331) 472-8624', '522-286-5318', '310 719 4550', '314-360-4288', '(301) 534-5754', '822 271 5719', '(341) 473-0639', '(835) 882-3693', '465-550-6610', '388 100 1482', '(589) 194-0523', '982 842 4913', '671 913 4563', '144 468 5864', '194 344 4039', '331 747 5714', '221-190-1449', '(322) 843-0185', '676-614-9095', '(190) 975-2514', '(909) 382-3774', '956 257 9319', '362 145 8268', '570 727 3998', '225 964 9193', '227 419 9482', '949 360 7605', '(347) 896-3463', '794 939 9735', '413 754 3034', '806 730 0459', '791 195 8909', '691-318-3535', '852-386-6029')
# Append another batch of sample phone-number strings.
phData <- c(phData, '380 682 7795', '(355) 550-1392', '247 586 4579', '235 257 1041', '(705) 456-1905', '106 756 2785', '(836) 207-8419', '(306) 552-1875', '729-102-7511', '795 583 0958', '700-431-3918', '929 632 1068', '763 906 2495', '469 976 6796', '887-657-4143', '574-438-5329', '319 127 9518', '429 960 9710', '419 646 0299', '192 343 8515', '521 336 8581', '776 367 6109', '470 367 1392', '(944) 189-7555', '998-931-4783', '362-178-6307', '(458) 404-9558', '212 286 7936', '(481) 522-1039', '(376) 611-4588', '936 193 9690', '(641) 544-6549', '797-870-7818', '693 907 5353', '332 973 4943', '929 622 9077', '649-379-5361', '(572) 748-6932', '395-892-5646', '221-628-9561', '(227) 801-6148', '549-649-1864', '342 941 0439', '701 390 9814', '(519) 573-6576', '919-342-0230', '364-759-2705', '949 543 7906', '942 732 6403', '(900) 586-1787', '308-607-9855', '764 645 5740', '472-337-8838', '791 847 7278', '128 805 3828', '365-832-0674', '123-282-3494', '285 424 4318', '452 352 1387', '129-377-8159', '222 143 3131', '162-451-0594', '(239) 325-5321', '436-422-6171', '605 284 4260', '929-102-5905', '847 507 8268', '452-811-8088', '799 143 1677', '196 756 4555', '119-444-0817', '885 454 0883', '945-998-0444', '(367) 897-7969', '163 241 9321', '594-176-5811', '(621) 874-9973', '332 963 4103', '389 318 3975', '894-593-7953', '561-266-7842', '354-958-8052', '151 921 2775', '901 140 3759', '(426) 342-7378', '333-520-4811', '(765) 191-1797', '850 914 9348', '246 272 9019', '(400) 250-0871', '504 419 9191', '(434) 725-0561', '231 863 7554', '(969) 207-3261', '939 253 9048', '879-154-4494', '577 786 2546', '(994) 688-3259', '841-717-4447', '397-353-6309', '558 191 4548', '905 768 2297', '348 522 2051', '307-323-6861', '663 886 2487', '274 944 6097', '(494) 308-3048', '328 307 0875', '673 524 3504', '934 721 0615', '102-957-6486', '(715) 288-8832', '794 925 8846', '637 281 4111', '(594) 448-6242', '335 802 2651', '560 699 9908', '451 163 0102', '613 800 0835', '370 453 5800', '192 507 
5411', '182-227-4838', '647 126 2332', '606 125 6957', '728-404-5558', '506 812 6052', '427-665-3475', '808 739 7162', '375-978-3305', '181-708-2089', '(802) 810-5574', '242-540-4234', '105 116 9695', '(867) 891-0871', '945 144 7892', '531 895 6695', '(533) 213-4368', '894 810 2674', '943 812 6349', '476 168 4235', '931 385 6757', '(324) 188-6781')
# Append another batch. From here the list begins repeating earlier entries.
# NOTE(review): '0244-5' (second element) is malformed — the same slot holds
# '459 572 0244' in later repeats of this list, so this looks like a
# deliberately corrupted record for the cleaning exercise; confirm intent.
phData <- c(phData, '100 378 8095', '0244-5', '397-362-5469', '(102) 928-7959', '(439) 568-6611', '(767) 205-0604', '(890) 548-9219', '938 982 5585', '769-472-2992', '190 204 1154', '(649) 925-8489', '321 616 1013', '(156) 837-4491', '178-232-0815', '(882) 304-9032', '828 549 6666', '280 544 4554', '183 208 5054', '(971) 548-6611', '(828) 153-5819', '203 448 1522', '900-871-9056', '406-167-1379', '(906) 850-9192', '(859) 777-8245', '641-635-8466', '807-671-6158', '(589) 270-7518', '768 529 8051', '220 660 0306', '928-179-7556', '153 756 0278', '273 829 9197', '269 463 0911', '696 984 8826', '905-903-5258', '267 332 4709', '(146) 129-5118', '(927) 747-9822', '534 216 6666', '(481) 479-7013', '971 175 2968', '(716) 777-3762', '274-863-3205', '(217) 589-0596', '928 445 5474', '858 990 5153', '731-813-2043', '563-732-6802', '145 725 4021', '931 311 5801', '(637) 782-6989', '172 990 3485', '872 325 4341', '(359) 803-9809', '152 790 8238', '330 561 9257', '437 420 7546', '495 632 4027', '(416) 788-2844', '311-305-4367', '817-400-0481', '430 723 1079', '(729) 609-4819', '(201) 737-4409', '(137) 611-3694', '226 490 8696', '123 570 8640', '665 803 2453', '(812) 869-6263', '639 132 6386', '(194) 198-0504', '437 886 0753', '626 756 5089', '(299) 137-6993', '714 950 3364', '653-786-5985', '518 286 5956', '194 960 2145', '362-136-1153', '376-456-0697', '657 832 1189', '962-918-6117', '692 929 3592', '805-877-3887', '(739) 710-2966', '819 732 4132', '367 221 9710', '361 154 1789', '680 488 1182', '928 638 1186', '588-693-9875', '681-308-7915', '783 647 8490', '897 847 0632', '150 952 4453', '322-884-3020', '176-313-5403', '487 109 4196', '(477) 182-4689', '544 382 8289', '781 543 7456', '911 829 6476', '(525) 362-5532', '517 986 3426', '838 220 5397', '(687) 887-6766', '179 163 0902', '539 137 8983', '733-154-0094', '639 881 3693', '291-830-3017', '(637) 100-0509', '750 520 0167', '676 485 8963', '135 566 5090', '337 260 4996', '371 185 2377', '280 461 1386', '(603) 149-7268', 
'(364) 792-5553', '496-429-1314', '459 671 4698', '486-268-3312', '497-518-4050', '(535) 685-8273', '859 495 4050', '(826) 738-8316', '724-134-3870', '(554) 269-8937', '125-578-4253', '614 800 2861', '487-232-4449', '(298) 135-0900', '(392) 183-7831', '(606) 596-1029', '(384) 953-4795', '(855) 811-8811', '253-374-7102', '(419) 295-9580', '802 102 8345', '417 393 0050', '787-624-8443', '(919) 486-4251', '341 824 5322', '415 551 1608', '(392) 495-7961', '473-238-3324', '(506) 760-3043', '876-834-0624', '919 611 6170', '(146) 699-3488', '261 434 7760', '617 310 2684', '182-535-3412', '506 129 1694', '(302) 339-0791', '(446) 229-4342', '(249) 602-6985', '(150) 905-6938', '313 990 8823', '656-941-5355', '(116) 689-6617', '955 324 5981', '175 808 2189', '(896) 993-8555', '105-687-6500', '757 524 2964', '201 374 2424', '(466) 912-8401', '766 112 6143', '783-463-4865', '853-803-9900', '(347) 851-5388', '992 114 6973', '316-212-7309', '301 672 1092', '795 137 0201', '381-883-5497', '100-531-4642', '994 923 6634', '920 355 8404', '(441) 445-6532', '325 795 2455', '593 829 6250', '566-482-9004', '542 537 6770', '716 191 1741', '491-727-7162', '167-336-5660', '358 831 0725', '432 979 7292', '(205) 382-5599', '(208) 794-9612', '728 662 3934', '380-918-8572', '(905) 742-3525', '151 434 6989', '755 544 2629', '633 181 4494', '346 706 5964', '688 690 2184', '618 717 1697', '185-321-6877')
# Append another batch (repeats the second batch of sample numbers).
phData <- c(phData, '(147) 535-3529', '(152) 912-4118', '726 943 7486', '(634) 521-4714', '670-248-0186', '(121) 509-7306', '(105) 635-5212', '(732) 168-0110', '364 834 3150', '176 508 2778', '120 941 0833', '670 902 3199', '(214) 250-8756', '(119) 975-8484', '297 484 3285', '(489) 534-6272', '(610) 716-5732', '(456) 925-4236', '743-103-7645', '432-281-3682', '(167) 144-9470', '648 685 6188', '548 191 4898', '288 110 9483', '946-558-5801', '388 744 9637', '506 463 9129', '(848) 149-5208', '(970) 908-2298', '(843) 120-5653', '306 394 8640', '170-641-3537', '860 723 5066', '814-895-6610', '(139) 727-9901', '598 735 8557', '593 895 6761', '(817) 824-3849', '508 484 9738', '719 489 4724', '503-671-4901', '275 649 8183', '968 130 7012', '290 367 6676', '499 766 9941', '538-393-2243', '(540) 362-7136', '(802) 910-1742', '845 544 4748', '784-458-8425', '(365) 217-0634', '708 500 2758', '(594) 797-7729', '982 555 9504', '477-307-3338', '744-301-1148', '389 484 8888', '739 303 6128', '175 905 9962', '(900) 462-1379', '878-636-2294', '(998) 692-1900', '994 421 8642', '304-225-5895', '931-522-5498', '838 898 2275', '718 187 7125', '943 561 8955', '341 261 6456', '(507) 483-3618', '826 266 0205', '(380) 449-7849', '589-975-0198', '657 680 8781', '501-668-7869', '224 365 8299', '747 588 1968', '(706) 836-7047', '(698) 462-6742', '748 446 1257', '525-552-4162', '635-714-8302', '114 668 2834', '404 788 2855', '673 292 2444', '136 788 1426', '564 677 3934', '213 981 7762', '453-556-0852', '423 593 4483', '820 609 7454', '131-641-1331', '824 540 9579', '(753) 726-0123', '739 737 9041', '609-332-7370', '426-182-1365', '(347) 782-5787', '898-210-6218', '938 394 0411', '904 844 1759', '658-861-4306', '(716) 184-1232', '380-105-1757', '(969) 555-0453', '249 452 4370', '443 384 8253', '711 289 2247', '343-973-0193', '367 650 3720', '(265) 286-5671', '148 630 8560', '413-727-2672', '(162) 332-5838', '(229) 604-7790', '(892) 301-0333', '148 501 5084', '564 780 8272', '524 190 0899', '463 
792 2782', '(331) 472-8624', '522-286-5318', '310 719 4550', '314-360-4288', '(301) 534-5754', '822 271 5719', '(341) 473-0639', '(835) 882-3693', '465-550-6610', '388 100 1482', '(589) 194-0523', '982 842 4913', '671 913 4563', '144 468 5864', '194 344 4039', '331 747 5714', '221-190-1449', '(322) 843-0185', '676-614-9095', '(190) 975-2514', '(909) 382-3774', '956 257 9319', '362 145 8268', '570 727 3998', '225 964 9193', '227 419 9482', '949 360 7605', '(347) 896-3463', '794 939 9735', '413 754 3034', '806 730 0459', '791 195 8909', '691-318-3535', '852-386-6029', '380 682 7795', '(355) 550-1392', '247 586 4579', '235 257 1041', '(705) 456-1905', '106 756 2785', '(836) 207-8419', '(306) 552-1875', '729-102-7511', '795 583 0958', '700-431-3918', '929 632 1068', '763 906 2495', '469 976 6796', '887-657-4143', '574-438-5329', '319 127 9518', '429 960 9710', '419 646 0299', '192 343 8515', '521 336 8581', '776 367 6109', '470 367 1392', '(944) 189-7555', '998-931-4783', '362-178-6307', '(458) 404-9558', '212 286 7936', '(481) 522-1039', '(376) 611-4588', '936 193 9690', '(641) 544-6549', '797-870-7818', '693 907 5353', '332 973 4943', '929 622 9077', '649-379-5361', '(572) 748-6932')
# Append another batch.
# NOTE(review): '925 8846' (after '(715) 288-8832') is missing its area code —
# other repeats of this list have '794 925 8846' in that slot; presumably a
# deliberately truncated record for the cleaning exercise; confirm intent.
phData <- c(phData, '395-892-5646', '221-628-9561', '(227) 801-6148', '549-649-1864', '342 941 0439', '701 390 9814', '(519) 573-6576', '919-342-0230', '364-759-2705', '949 543 7906', '942 732 6403', '(900) 586-1787', '308-607-9855', '764 645 5740', '472-337-8838', '791 847 7278', '128 805 3828', '365-832-0674', '123-282-3494', '285 424 4318', '452 352 1387', '129-377-8159', '222 143 3131', '162-451-0594', '(239) 325-5321', '436-422-6171', '605 284 4260', '929-102-5905', '847 507 8268', '452-811-8088', '799 143 1677', '196 756 4555', '119-444-0817', '885 454 0883', '945-998-0444', '(367) 897-7969', '163 241 9321', '594-176-5811', '(621) 874-9973', '332 963 4103', '389 318 3975', '894-593-7953', '561-266-7842', '354-958-8052', '151 921 2775', '901 140 3759', '(426) 342-7378', '333-520-4811', '(765) 191-1797', '850 914 9348', '246 272 9019', '(400) 250-0871', '504 419 9191', '(434) 725-0561', '231 863 7554', '(969) 207-3261', '939 253 9048', '879-154-4494', '577 786 2546', '(994) 688-3259', '841-717-4447', '397-353-6309', '558 191 4548', '905 768 2297', '348 522 2051', '307-323-6861', '663 886 2487', '274 944 6097', '(494) 308-3048', '328 307 0875', '673 524 3504', '934 721 0615', '102-957-6486', '(715) 288-8832', '925 8846', '637 281 4111', '(594) 448-6242', '335 802 2651', '560 699 9908', '451 163 0102', '613 800 0835', '370 453 5800', '192 507 5411', '182-227-4838', '647 126 2332', '606 125 6957', '728-404-5558', '506 812 6052', '427-665-3475', '808 739 7162', '375-978-3305', '181-708-2089', '(802) 810-5574', '242-540-4234', '105 116 9695', '(867) 891-0871', '945 144 7892', '531 895 6695', '(533) 213-4368', '894 810 2674', '943 812 6349', '476 168 4235', '931 385 6757', '(324) 188-6781', '100 378 8095', '459 572 0244', '397-362-5469', '(102) 928-7959', '(439) 568-6611', '(767) 205-0604', '(890) 548-9219', '938 982 5585', '769-472-2992', '190 204 1154', '(649) 925-8489', '321 616 1013', '(156) 837-4491', '178-232-0815', '(882) 304-9032', '828 549 6666', '280 544 
4554', '183 208 5054', '(971) 548-6611', '(828) 153-5819', '203 448 1522', '900-871-9056', '406-167-1379', '(906) 850-9192', '(859) 777-8245', '641-635-8466', '807-671-6158', '(589) 270-7518', '768 529 8051', '220 660 0306', '928-179-7556', '153 756 0278', '273 829 9197', '269 463 0911', '696 984 8826', '905-903-5258')
# Append another batch (repeats the first batch of sample numbers).
phData <- c(phData, '267 332 4709', '(146) 129-5118', '(927) 747-9822', '534 216 6666', '(481) 479-7013', '971 175 2968', '(716) 777-3762', '274-863-3205', '(217) 589-0596', '928 445 5474', '858 990 5153', '731-813-2043', '563-732-6802', '145 725 4021', '931 311 5801', '(637) 782-6989', '172 990 3485', '872 325 4341', '(359) 803-9809', '152 790 8238', '330 561 9257', '437 420 7546', '495 632 4027', '(416) 788-2844', '311-305-4367', '817-400-0481', '430 723 1079', '(729) 609-4819', '(201) 737-4409', '(137) 611-3694', '226 490 8696', '123 570 8640', '665 803 2453', '(812) 869-6263', '639 132 6386', '(194) 198-0504', '437 886 0753', '626 756 5089', '(299) 137-6993', '714 950 3364', '653-786-5985', '518 286 5956', '194 960 2145', '362-136-1153', '376-456-0697', '657 832 1189', '962-918-6117', '692 929 3592', '805-877-3887', '(739) 710-2966', '819 732 4132', '367 221 9710', '361 154 1789', '680 488 1182', '928 638 1186', '588-693-9875', '681-308-7915', '783 647 8490', '897 847 0632', '150 952 4453', '322-884-3020', '176-313-5403', '487 109 4196', '(477) 182-4689', '544 382 8289', '781 543 7456', '911 829 6476', '(525) 362-5532', '517 986 3426', '838 220 5397', '(687) 887-6766', '179 163 0902', '539 137 8983', '733-154-0094', '639 881 3693', '291-830-3017', '(637) 100-0509', '750 520 0167', '676 485 8963', '135 566 5090', '337 260 4996', '371 185 2377', '280 461 1386', '(603) 149-7268', '(364) 792-5553', '496-429-1314', '459 671 4698', '486-268-3312', '497-518-4050', '(535) 685-8273', '859 495 4050', '(826) 738-8316', '724-134-3870', '(554) 269-8937', '125-578-4253', '614 800 2861', '487-232-4449', '(298) 135-0900', '(392) 183-7831', '(606) 596-1029', '(384) 953-4795', '(855) 811-8811', '253-374-7102', '(419) 295-9580', '802 102 8345', '417 393 0050', '787-624-8443', '(919) 486-4251', '341 824 5322', '415 551 1608', '(392) 495-7961', '473-238-3324', '(506) 760-3043', '876-834-0624', '919 611 6170', '(146) 699-3488', '261 434 7760', '617 310 2684', '182-535-3412', '506 
129 1694', '(302) 339-0791', '(446) 229-4342', '(249) 602-6985', '(150) 905-6938', '313 990 8823', '656-941-5355', '(116) 689-6617', '955 324 5981', '175 808 2189', '(896) 993-8555', '105-687-6500', '757 524 2964', '201 374 2424', '(466) 912-8401', '766 112 6143', '783-463-4865', '853-803-9900', '(347) 851-5388')
# Append another batch (continues repeating earlier sample numbers).
phData <- c(phData, '992 114 6973', '316-212-7309', '301 672 1092', '795 137 0201', '381-883-5497', '100-531-4642', '994 923 6634', '920 355 8404', '(441) 445-6532', '325 795 2455', '593 829 6250', '566-482-9004', '542 537 6770', '716 191 1741', '491-727-7162', '167-336-5660', '358 831 0725', '432 979 7292', '(205) 382-5599', '(208) 794-9612', '728 662 3934', '380-918-8572', '(905) 742-3525', '151 434 6989', '755 544 2629', '633 181 4494', '346 706 5964', '688 690 2184', '618 717 1697', '185-321-6877', '(147) 535-3529', '(152) 912-4118', '726 943 7486', '(634) 521-4714', '670-248-0186', '(121) 509-7306', '(105) 635-5212', '(732) 168-0110', '364 834 3150', '176 508 2778', '120 941 0833', '670 902 3199', '(214) 250-8756', '(119) 975-8484', '297 484 3285', '(489) 534-6272', '(610) 716-5732', '(456) 925-4236', '743-103-7645', '432-281-3682', '(167) 144-9470', '648 685 6188', '548 191 4898', '288 110 9483', '946-558-5801', '388 744 9637', '506 463 9129', '(848) 149-5208', '(970) 908-2298', '(843) 120-5653', '306 394 8640', '170-641-3537', '860 723 5066', '814-895-6610', '(139) 727-9901', '598 735 8557', '593 895 6761', '(817) 824-3849', '508 484 9738', '719 489 4724', '503-671-4901', '275 649 8183', '968 130 7012', '290 367 6676', '499 766 9941', '538-393-2243', '(540) 362-7136', '(802) 910-1742', '845 544 4748', '784-458-8425', '(365) 217-0634', '708 500 2758', '(594) 797-7729', '982 555 9504', '477-307-3338', '744-301-1148', '389 484 8888', '739 303 6128', '175 905 9962', '(900) 462-1379', '878-636-2294', '(998) 692-1900', '994 421 8642', '304-225-5895', '931-522-5498', '838 898 2275', '718 187 7125', '943 561 8955', '341 261 6456', '(507) 483-3618', '826 266 0205', '(380) 449-7849', '589-975-0198', '657 680 8781', '501-668-7869', '224 365 8299', '747 588 1968', '(706) 836-7047', '(698) 462-6742', '748 446 1257', '525-552-4162', '635-714-8302', '114 668 2834', '404 788 2855', '673 292 2444', '136 788 1426', '564 677 3934', '213 981 7762', '453-556-0852', '423 593 
4483', '820 609 7454', '131-641-1331', '824 540 9579', '(753) 726-0123', '739 737 9041', '609-332-7370', '426-182-1365', '(347) 782-5787', '898-210-6218', '938 394 0411', '904 844 1759', '658-861-4306', '(716) 184-1232', '380-105-1757', '(969) 555-0453', '249 452 4370', '443 384 8253', '711 289 2247', '343-973-0193')
# Append another batch.
# NOTE(review): '1623' (fifth element) is malformed — other repeats of this
# list have '(162) 332-5838' in that slot; presumably a deliberately
# corrupted record for the cleaning exercise; confirm intent.
phData <- c(phData, '367 650 3720', '(265) 286-5671', '148 630 8560', '413-727-2672', '1623', '(229) 604-7790', '(892) 301-0333', '148 501 5084', '564 780 8272', '524 190 0899', '463 792 2782', '(331) 472-8624', '522-286-5318', '310 719 4550', '314-360-4288', '(301) 534-5754', '822 271 5719', '(341) 473-0639', '(835) 882-3693', '465-550-6610', '388 100 1482', '(589) 194-0523', '982 842 4913', '671 913 4563', '144 468 5864', '194 344 4039', '331 747 5714', '221-190-1449', '(322) 843-0185', '676-614-9095', '(190) 975-2514', '(909) 382-3774', '956 257 9319', '362 145 8268', '570 727 3998', '225 964 9193', '227 419 9482', '949 360 7605', '(347) 896-3463', '794 939 9735', '413 754 3034', '806 730 0459', '791 195 8909', '691-318-3535', '852-386-6029', '380 682 7795', '(355) 550-1392', '247 586 4579', '235 257 1041', '(705) 456-1905', '106 756 2785', '(836) 207-8419', '(306) 552-1875', '729-102-7511', '795 583 0958', '700-431-3918', '929 632 1068', '763 906 2495', '469 976 6796', '887-657-4143', '574-438-5329', '319 127 9518', '429 960 9710', '419 646 0299', '192 343 8515', '521 336 8581', '776 367 6109', '470 367 1392', '(944) 189-7555', '998-931-4783', '362-178-6307', '(458) 404-9558', '212 286 7936', '(481) 522-1039', '(376) 611-4588', '936 193 9690', '(641) 544-6549', '797-870-7818', '693 907 5353', '332 973 4943', '929 622 9077', '649-379-5361', '(572) 748-6932', '395-892-5646', '221-628-9561', '(227) 801-6148', '549-649-1864', '342 941 0439', '701 390 9814', '(519) 573-6576', '919-342-0230', '364-759-2705', '949 543 7906', '942 732 6403', '(900) 586-1787', '308-607-9855', '764 645 5740', '472-337-8838', '791 847 7278', '128 805 3828', '365-832-0674', '123-282-3494', '285 424 4318', '452 352 1387', '129-377-8159', '222 143 3131', '162-451-0594', '(239) 325-5321', '436-422-6171', '605 284 4260', '929-102-5905', '847 507 8268', '452-811-8088', '799 143 1677', '196 756 4555', '119-444-0817', '885 454 0883', '945-998-0444', '(367) 897-7969', '163 241 9321', 
'594-176-5811', '(621) 874-9973', '332 963 4103', '389 318 3975', '894-593-7953', '561-266-7842', '354-958-8052', '151 921 2775', '901 140 3759', '(426) 342-7378', '333-520-4811', '(765) 191-1797', '850 914 9348', '246 272 9019', '(400) 250-0871', '504 419 9191', '(434) 725-0561', '231 863 7554', '(969) 207-3261', '939 253 9048', '879-154-4494', '577 786 2546', '(994) 688-3259', '841-717-4447')
# Append another batch.
# NOTE(review): '665-803' (near the end, after '123 570 8640') is truncated —
# other repeats of this list have '665 803 2453' in that slot; presumably a
# deliberately corrupted record for the cleaning exercise; confirm intent.
phData <- c(phData, '397-353-6309', '558 191 4548', '905 768 2297', '348 522 2051', '307-323-6861', '663 886 2487', '274 944 6097', '(494) 308-3048', '328 307 0875', '673 524 3504', '934 721 0615', '102-957-6486', '(715) 288-8832', '794 925 8846', '637 281 4111', '(594) 448-6242', '335 802 2651', '560 699 9908', '451 163 0102', '613 800 0835', '370 453 5800', '192 507 5411', '182-227-4838', '647 126 2332', '606 125 6957', '728-404-5558', '506 812 6052', '427-665-3475', '808 739 7162', '375-978-3305', '181-708-2089', '(802) 810-5574', '242-540-4234', '105 116 9695', '(867) 891-0871', '945 144 7892', '531 895 6695', '(533) 213-4368', '894 810 2674', '943 812 6349', '476 168 4235', '931 385 6757', '(324) 188-6781', '100 378 8095', '459 572 0244', '397-362-5469', '(102) 928-7959', '(439) 568-6611', '(767) 205-0604', '(890) 548-9219', '938 982 5585', '769-472-2992', '190 204 1154', '(649) 925-8489', '321 616 1013', '(156) 837-4491', '178-232-0815', '(882) 304-9032', '828 549 6666', '280 544 4554', '183 208 5054', '(971) 548-6611', '(828) 153-5819', '203 448 1522', '900-871-9056', '406-167-1379', '(906) 850-9192', '(859) 777-8245', '641-635-8466', '807-671-6158', '(589) 270-7518', '768 529 8051', '220 660 0306', '928-179-7556', '153 756 0278', '273 829 9197', '269 463 0911', '696 984 8826', '905-903-5258', '267 332 4709', '(146) 129-5118', '(927) 747-9822', '534 216 6666', '(481) 479-7013', '971 175 2968', '(716) 777-3762', '274-863-3205', '(217) 589-0596', '928 445 5474', '858 990 5153', '731-813-2043', '563-732-6802', '145 725 4021', '931 311 5801', '(637) 782-6989', '172 990 3485', '872 325 4341', '(359) 803-9809', '152 790 8238', '330 561 9257', '437 420 7546', '495 632 4027', '(416) 788-2844', '311-305-4367', '817-400-0481', '430 723 1079', '(729) 609-4819', '(201) 737-4409', '(137) 611-3694', '226 490 8696', '123 570 8640', '665-803', '(812) 869-6263', '639 132 6386', '(194) 198-0504', '437 886 0753', '626 756 5089', '(299) 137-6993', '714 950 3364', 
'653-786-5985', '518 286 5956', '194 960 2145', '362-136-1153', '376-456-0697', '657 832 1189', '962-918-6117', '692 929 3592', '805-877-3887', '(739) 710-2966', '819 732 4132', '367 221 9710', '361 154 1789', '680 488 1182', '928 638 1186', '588-693-9875', '681-308-7915', '783 647 8490', '897 847 0632', '150 952 4453', '322-884-3020')
# Append another batch (continues repeating earlier sample numbers).
phData <- c(phData, '176-313-5403', '487 109 4196', '(477) 182-4689', '544 382 8289', '781 543 7456', '911 829 6476', '(525) 362-5532', '517 986 3426', '838 220 5397', '(687) 887-6766', '179 163 0902', '539 137 8983', '733-154-0094', '639 881 3693', '291-830-3017', '(637) 100-0509', '750 520 0167', '676 485 8963', '135 566 5090', '337 260 4996', '371 185 2377', '280 461 1386', '(603) 149-7268', '(364) 792-5553', '496-429-1314', '459 671 4698', '486-268-3312', '497-518-4050', '(535) 685-8273', '859 495 4050', '(826) 738-8316', '724-134-3870', '(554) 269-8937', '125-578-4253', '614 800 2861', '487-232-4449', '(298) 135-0900', '(392) 183-7831', '(606) 596-1029', '(384) 953-4795', '(855) 811-8811', '253-374-7102', '(419) 295-9580', '802 102 8345', '417 393 0050', '787-624-8443', '(919) 486-4251', '341 824 5322', '415 551 1608', '(392) 495-7961', '473-238-3324', '(506) 760-3043', '876-834-0624', '919 611 6170', '(146) 699-3488', '261 434 7760', '617 310 2684', '182-535-3412', '506 129 1694', '(302) 339-0791', '(446) 229-4342', '(249) 602-6985', '(150) 905-6938', '313 990 8823', '656-941-5355', '(116) 689-6617', '955 324 5981', '175 808 2189', '(896) 993-8555', '105-687-6500', '757 524 2964', '201 374 2424', '(466) 912-8401', '766 112 6143', '783-463-4865', '853-803-9900', '(347) 851-5388', '992 114 6973', '316-212-7309', '301 672 1092', '795 137 0201', '381-883-5497', '100-531-4642', '994 923 6634', '920 355 8404', '(441) 445-6532', '325 795 2455', '593 829 6250', '566-482-9004', '542 537 6770', '716 191 1741', '491-727-7162', '167-336-5660', '358 831 0725', '432 979 7292', '(205) 382-5599', '(208) 794-9612', '728 662 3934', '380-918-8572', '(905) 742-3525', '151 434 6989', '755 544 2629', '633 181 4494', '346 706 5964', '688 690 2184', '618 717 1697', '185-321-6877', '(147) 535-3529', '(152) 912-4118', '726 943 7486', '(634) 521-4714', '670-248-0186', '(121) 509-7306', '(105) 635-5212', '(732) 168-0110', '364 834 3150', '176 508 2778', '120 941 0833', '670 902 3199', 
'(214) 250-8756', '(119) 975-8484', '297 484 3285', '(489) 534-6272', '(610) 716-5732', '(456) 925-4236', '743-103-7645', '432-281-3682', '(167) 144-9470', '648 685 6188', '548 191 4898', '288 110 9483', '946-558-5801', '388 744 9637', '506 463 9129', '(848) 149-5208', '(970) 908-2298', '(843) 120-5653')
# Append another batch (continues repeating earlier sample numbers).
phData <- c(phData, '306 394 8640', '170-641-3537', '860 723 5066', '814-895-6610', '(139) 727-9901', '598 735 8557', '593 895 6761', '(817) 824-3849', '508 484 9738', '719 489 4724', '503-671-4901', '275 649 8183', '968 130 7012', '290 367 6676', '499 766 9941', '538-393-2243', '(540) 362-7136', '(802) 910-1742', '845 544 4748', '784-458-8425', '(365) 217-0634', '708 500 2758', '(594) 797-7729', '982 555 9504', '477-307-3338', '744-301-1148', '389 484 8888', '739 303 6128', '175 905 9962', '(900) 462-1379', '878-636-2294', '(998) 692-1900', '994 421 8642', '304-225-5895', '931-522-5498', '838 898 2275', '718 187 7125', '943 561 8955', '341 261 6456', '(507) 483-3618', '826 266 0205', '(380) 449-7849', '589-975-0198', '657 680 8781', '501-668-7869', '224 365 8299', '747 588 1968', '(706) 836-7047', '(698) 462-6742', '748 446 1257', '525-552-4162', '635-714-8302', '114 668 2834', '404 788 2855', '673 292 2444', '136 788 1426', '564 677 3934', '213 981 7762', '453-556-0852', '423 593 4483', '820 609 7454', '131-641-1331', '824 540 9579', '(753) 726-0123', '739 737 9041', '609-332-7370', '426-182-1365', '(347) 782-5787', '898-210-6218', '938 394 0411', '904 844 1759', '658-861-4306', '(716) 184-1232', '380-105-1757', '(969) 555-0453', '249 452 4370', '443 384 8253', '711 289 2247', '343-973-0193', '367 650 3720', '(265) 286-5671', '148 630 8560', '413-727-2672', '(162) 332-5838', '(229) 604-7790', '(892) 301-0333', '148 501 5084', '564 780 8272', '524 190 0899', '463 792 2782', '(331) 472-8624', '522-286-5318', '310 719 4550', '314-360-4288', '(301) 534-5754', '822 271 5719', '(341) 473-0639', '(835) 882-3693', '465-550-6610', '388 100 1482', '(589) 194-0523', '982 842 4913', '671 913 4563', '144 468 5864', '194 344 4039', '331 747 5714', '221-190-1449', '(322) 843-0185', '676-614-9095', '(190) 975-2514', '(909) 382-3774', '956 257 9319', '362 145 8268', '570 727 3998', '225 964 9193', '227 419 9482', '949 360 7605', '(347) 896-3463', '794 939 9735', '413 754 3034', 
'806 730 0459', '791 195 8909', '691-318-3535', '852-386-6029', '380 682 7795', '(355) 550-1392', '247 586 4579', '235 257 1041', '(705) 456-1905', '106 756 2785', '(836) 207-8419', '(306) 552-1875', '729-102-7511', '795 583 0958')
# Append another batch (continues repeating earlier sample numbers).
phData <- c(phData, '700-431-3918', '929 632 1068', '763 906 2495', '469 976 6796', '887-657-4143', '574-438-5329', '319 127 9518', '429 960 9710', '419 646 0299', '192 343 8515', '521 336 8581', '776 367 6109', '470 367 1392', '(944) 189-7555', '998-931-4783', '362-178-6307', '(458) 404-9558', '212 286 7936', '(481) 522-1039', '(376) 611-4588', '936 193 9690', '(641) 544-6549', '797-870-7818', '693 907 5353', '332 973 4943', '929 622 9077', '649-379-5361', '(572) 748-6932', '395-892-5646', '221-628-9561', '(227) 801-6148', '549-649-1864', '342 941 0439', '701 390 9814', '(519) 573-6576', '919-342-0230', '364-759-2705', '949 543 7906', '942 732 6403', '(900) 586-1787', '308-607-9855', '764 645 5740', '472-337-8838', '791 847 7278', '128 805 3828', '365-832-0674', '123-282-3494', '285 424 4318', '452 352 1387', '129-377-8159', '222 143 3131', '162-451-0594', '(239) 325-5321', '436-422-6171', '605 284 4260', '929-102-5905', '847 507 8268', '452-811-8088', '799 143 1677', '196 756 4555', '119-444-0817', '885 454 0883', '945-998-0444', '(367) 897-7969', '163 241 9321', '594-176-5811', '(621) 874-9973', '332 963 4103', '389 318 3975', '894-593-7953', '561-266-7842', '354-958-8052', '151 921 2775', '901 140 3759', '(426) 342-7378', '333-520-4811', '(765) 191-1797', '850 914 9348', '246 272 9019', '(400) 250-0871', '504 419 9191', '(434) 725-0561', '231 863 7554', '(969) 207-3261', '939 253 9048', '879-154-4494', '577 786 2546', '(994) 688-3259', '841-717-4447', '397-353-6309', '558 191 4548', '905 768 2297', '348 522 2051', '307-323-6861', '663 886 2487', '274 944 6097', '(494) 308-3048', '328 307 0875', '673 524 3504', '934 721 0615', '102-957-6486', '(715) 288-8832', '794 925 8846', '637 281 4111', '(594) 448-6242', '335 802 2651', '560 699 9908', '451 163 0102', '613 800 0835', '370 453 5800', '192 507 5411', '182-227-4838', '647 126 2332', '606 125 6957', '728-404-5558', '506 812 6052', '427-665-3475', '808 739 7162', '375-978-3305', '181-708-2089', '(802) 810-5574', 
'242-540-4234', '105 116 9695', '(867) 891-0871', '945 144 7892', '531 895 6695', '(533) 213-4368', '894 810 2674', '943 812 6349', '476 168 4235', '931 385 6757', '(324) 188-6781', '100 378 8095', '459 572 0244', '397-362-5469', '(102) 928-7959', '(439) 568-6611', '(767) 205-0604', '(890) 548-9219', '938 982 5585', '769-472-2992')
# Append another batch (continues repeating earlier sample numbers).
phData <- c(phData, '190 204 1154', '(649) 925-8489', '321 616 1013', '(156) 837-4491', '178-232-0815', '(882) 304-9032', '828 549 6666', '280 544 4554', '183 208 5054', '(971) 548-6611', '(828) 153-5819', '203 448 1522', '900-871-9056', '406-167-1379', '(906) 850-9192', '(859) 777-8245', '641-635-8466', '807-671-6158', '(589) 270-7518', '768 529 8051', '220 660 0306', '928-179-7556', '153 756 0278', '273 829 9197', '269 463 0911', '696 984 8826', '905-903-5258', '267 332 4709', '(146) 129-5118', '(927) 747-9822', '534 216 6666', '(481) 479-7013', '971 175 2968', '(716) 777-3762', '274-863-3205', '(217) 589-0596', '928 445 5474', '858 990 5153', '731-813-2043', '563-732-6802', '145 725 4021', '931 311 5801', '(637) 782-6989', '172 990 3485', '872 325 4341', '(359) 803-9809', '152 790 8238', '330 561 9257', '437 420 7546', '495 632 4027', '(416) 788-2844', '311-305-4367', '817-400-0481', '430 723 1079', '(729) 609-4819', '(201) 737-4409', '(137) 611-3694', '226 490 8696', '123 570 8640', '665 803 2453', '(812) 869-6263', '639 132 6386', '(194) 198-0504', '437 886 0753', '626 756 5089', '(299) 137-6993', '714 950 3364', '653-786-5985', '518 286 5956', '194 960 2145', '362-136-1153', '376-456-0697', '657 832 1189', '962-918-6117', '692 929 3592', '805-877-3887', '(739) 710-2966', '819 732 4132', '367 221 9710', '361 154 1789', '680 488 1182', '928 638 1186', '588-693-9875', '681-308-7915', '783 647 8490', '897 847 0632', '150 952 4453', '322-884-3020', '176-313-5403', '487 109 4196', '(477) 182-4689', '544 382 8289', '781 543 7456', '911 829 6476', '(525) 362-5532', '517 986 3426', '838 220 5397', '(687) 887-6766', '179 163 0902', '539 137 8983', '733-154-0094', '639 881 3693', '291-830-3017', '(637) 100-0509', '750 520 0167', '676 485 8963', '135 566 5090', '337 260 4996', '371 185 2377', '280 461 1386', '(603) 149-7268', '(364) 792-5553', '496-429-1314', '459 671 4698', '486-268-3312', '497-518-4050', '(535) 685-8273', '859 495 4050', '(826) 738-8316', 
'724-134-3870', '(554) 269-8937', '125-578-4253', '614 800 2861', '487-232-4449', '(298) 135-0900', '(392) 183-7831', '(606) 596-1029', '(384) 953-4795', '(855) 811-8811', '253-374-7102', '(419) 295-9580', '802 102 8345', '417 393 0050', '787-624-8443', '(919) 486-4251', '341 824 5322', '415 551 1608', '(392) 495-7961', '473-238-3324')
phData <- c(phData, '(506) 760-3043', '876-834-0624', '919 611 6170', '(146) 699-3488', '261 434 7760', '617 310 2684', '182-535-3412', '506 129 1694', '(302) 339-0791', '(446) 229-4342', '(249) 602-6985', '(150) 905-6938', '313 990 8823', '656-941-5355', '(116) 689-6617', '955 324 5981', '175 808 2189', '(896) 993-8555', '105-687-6500', '757 524 2964', '201 374 2424', '(466) 912-8401', '766 112 6143', '783-463-4865', '853-803-9900', '(347) 851-5388', '992 114 6973', '316-212-7309', '301 672 1092', '795 137 0201', '381-883-5497', '100-531-4642', '994 923 6634', '920 355 8404', '(441) 445-6532', '325 795 2455', '593 829 6250', '566-482-9004', '542 537 6770', '716 191 1741', '491-727-7162', '167-336-5660', '358 831 0725', '432 979 7292', '(205) 382-5599', '(208) 794-9612', '728 662 3934', '380-918-8572', '(905) 742-3525', '151 434 6989', '755 544 2629', '633 181 4494', '346 706 5964', '688 690 2184', '618 717 1697', '185-321-6877', '(147) 535-3529', '(152) 912-4118', '726 943 7486', '(634) 521-4714', '670-248-0186', '(121) 509-7306', '(105) 635-5212', '(732) 168-0110', '364 834 3150', '176 508 2778', '120 941 0833', '670 902 3199', '(214) 250-8756', '(119) 975-8484', '297 484 3285', '(489) 534-6272', '(610) 716-5732', '(456) 925-4236', '743-103-7645', '432-281-3682', '(167) 144-9470', '648 685 6188', '548 191 4898', '288 110 9483', '946-558-5801', '388 744 9637', '506 463 9129', '(848) 149-5208', '(970) 908-2298', '(843) 120-5653', '306 394 8640', '170-641-3537', '860 723 5066', '814-895-6610', '(139) 727-9901', '598 735 8557', '593 895 6761', '(817) 824-3849', '508 484 9738', '719 489 4724', '503-671-4901', '275 649 8183', '968 130 7012', '290 367 6676', '499 766 9941', '538-393-2243', '(540) 362-7136', '(802) 910-1742', '845 544 4748', '784-458-8425', '(365) 217-0634', '708 500 2758', '(594) 797-7729', '982 555 9504', '477-307-3338', '744-301-1148', '389 484 8888', '739 303 6128', '175 905 9962', '(900) 462-1379', '878-636-2294', '(998) 692-1900', '994 421 8642', 
'304-225-5895', '931-522-5498', '838 898 2275', '718 187 7125', '943 561 8955', '341 261 6456', '(507) 483-3618', '826 266 0205', '(380) 449-7849', '589-975-0198', '657 680 8781', '501-668-7869', '224 365 8299', '747 588 1968', '(706) 836-7047', '(698) 462-6742', '748 446 1257', '525-552-4162', '635-714-8302')
phData <- c(phData, '114 668 2834', '404 788 2855', '673 292 2444', '136 788 1426', '564 677 3934', '213 981 7762', '453-556-0852', '423 593 4483', '820 609 7454', '131-641-1331', '824 540 9579', '(753) 726-0123', '739 737 9041', '609-332-7370', '426-182-1365', '(347) 782-5787', '898-210-6218', '938 394 0411', '904 844 1759', '658-861-4306', '(716) 184-1232', '380-105-1757', '(969) 555-0453', '249 452 4370', '443 384 8253', '711 289 2247', '343-973-0193', '367 650 3720', '(265) 286-5671', '148 630 8560', '413-727-2672', '(162) 332-5838', '(229) 604-7790', '(892) 301-0333', '148 501 5084', '564 780 8272', '524 190 0899', '463 792 2782', '(331) 472-8624', '522-286-5318', '310 719 4550', '314-360-4288', '(301) 534-5754', '822 271 5719', '(341) 473-0639', '(835) 882-3693', '465-550-6610', '388 100 1482', '(589) 194-0523', '982 842 4913', '671 913 4563', '144 468 5864', '194 344 4039', '331 747 5714', '221-190-1449', '(322) 843-0185', '676-614-9095', '(190) 975-2514', '(909) 382-3774', '956 257 9319', '362 145 8268', '570 727 3998', '225 964 9193', '227 419 9482', '949 360 7605', '(347) 896-3463', '794 939 9735', '413 754 3034', '806 730 0459', '791 195 8909', '691-318-3535', '852-386-6029', '380 682 7795', '(355) 550-1392', '247 586 4579', '235 257 1041', '(705) 456-1905', '106 756 2785', '(836) 207-8419', '(306) 552-1875', '729-102-7511', '795 583 0958', '700-431-3918', '929 632 1068', '763 906 2495', '469 976 6796', '887-657-4143', '574-438-5329', '319 127 9518', '429 960 9710', '419 646 0299', '38515', '521 336 8581', '776 367 6109', '470 367 1392', '(944) 189-7555', '998-931-4783', '362-178-6307', '(458) 404-9558', '212 286 7936', '(481) 522-1039', '(376) 611-4588', '936 193 9690', '(641) 544-6549', '797-870-7818', '693 907 5353', '332 973 4943', '929 622 9077', '649-379-5361', '(572) 748-6932', '395-892-5646', '221-628-9561', '(227) 801-6148', '549-649-1864', '342 941 0439', '701 390 9814', '(519) 573-6576', '919-342-0230', '364-759-2705', '949 543 7906', '942 732 
6403', '(900) 586-1787', '308-607-9855', '764 645 5740', '472-337-8838', '791 847 7278', '128 805 3828', '365-832-0674', '123-282-3494', '285 424 4318', '452 352 1387', '129-377-8159', '222 143 3131', '162-451-0594', '(239) 325-5321', '436-422-6171', '605 284 4260', '929-102-5905', '847 507 8268')
phData <- c(phData, '452-811-8088', '799 143 1677', '196 756 4555', '119-444-0817', '885 454 0883', '945-998-0444', '(367) 897-7969', '163 241 9321', '594-176-5811', '(621) 874-9973', '332 963 4103', '389 318 3975', '894-593-7953', '561-266-7842', '354-958-8052', '151 921 2775', '901 140 3759', '(426) 342-7378', '333-520-4811', '(765) 191-1797', '850 914 9348', '246 272 9019', '(400) 250-0871', '504 419 9191', '(434) 725-0561', '231 863 7554', '(969) 207-3261', '939 253 9048', '879-154-4494', '577 786 2546', '(994) 688-3259', '841-717-4447', '397-353-6309', '558 191 4548', '905 768 2297', '348 522 2051', '307-323-6861', '663 886 2487', '274 944 6097', '(494) 308-3048', '328 307 0875', '673 524 3504', '934 721 0615', '102-957-6486', '(715) 288-8832', '794 925 8846', '637 281 4111', '(594) 448-6242', '335 802 2651', '560 699 9908', '451 163 0102', '613 800 0835', '370 453 5800', '192 507 5411', '182-227-4838', '647 126 2332', '606 125 6957', '728-404-5558', '506 812 6052', '427-665-3475', '808 739 7162', '375-978-3305', '181-708-2089', '(802) 810-5574', '242-540-4234', '105 116 9695', '(867) 891-0871', '945 144 7892', '531 895 6695', '(533) 213-4368', '894 810 2674', '943 812 6349', '476 168 4235', '931 385 6757', '(324) 188-6781', '100 378 8095', '459 572 0244', '397-362-5469', '(102) 928-7959', '(439) 568-6611', '(767) 205-0604', '(890) 548-9219', '938 982 5585', '769-472-2992', '190 204 1154', '(649) 925-8489', '321 616 1013', '(156) 837-4491', '178-232-0815', '(882) 304-9032', '828 549 6666', '280 544 4554', '183 208 5054', '(971) 548-6611', '(828) 153-5819', '203 448 1522', '900-871-9056', '406-167-1379', '(906) 850-9192', '(859) 777-8245', '641-635-8466', '807-671-6158', '(589) 270-7518', '768 529 8051', '220 660 0306', '928-179-7556', '153 756 0278', '273 829 9197', '269 463 0911', '696 984 8826', '905-903-5258', '267 332 4709', '(146) 129-5118', '(927) 747-9822', '534 216 6666', '(481) 479-7013', '971 175 2968', '(716) 777-3762', '274-863-3205', '(217) 
589-0596', '928 445 5474', '858 990 5153', '731-813-2043', '563-732-6802', '145 725 4021', '931 311 5801', '(637) 782-6989', '172 990 3485', '872 325 4341', '(359) 803-9809', '152 790 8238', '330 561 9257', '437 420 7546', '495 632 4027', '(416) 788-2844', '311-305-4367', '817-400-0481', '430 723 1079', '(729) 609-4819')
phData <- c(phData, '(201) 737-4409', '(137) 611-3694', '226 490 8696', '123 570 8640', '665 803 2453', '(812) 869-6263', '639 132 6386', '(194) 198-0504', '437 886 0753', '626 756 5089', '(299) 137-6993', '714 950 3364', '653-786-5985', '518 286 5956', '194 960 2145', '362-136-1153', '376-456-0697', '657 832 1189', '962-918-6117', '692 929 3592', '805-877-3887', '(739) 710-2966', '819 732 4132', '367 221 9710', '361 154 1789', '680 488 1182', '928 638 1186', '588-693-9875', '681-308-7915', '783 647 8490', '897 847 0632', '150 952 4453', '322-884-3020', '176-313-5403', '487 109 4196', '(477) 182-4689', '544 382 8289', '781 543 7456', '911 829 6476', '(525) 362-5532', '517 986 3426', '838 220 5397', '(687) 887-6766', '179 163 0902', '539 137 8983', '733-154-0094', '639 881 3693', '291-830-3017', '(637) 100-0509', '750 520 0167', '676 485 8963', '135 566 5090', '337 260 4996', '371 185 2377', '280 461 1386', '(603) 149-7268', '(364) 792-5553', '496-429-1314', '459 671 4698', '486-268-3312', '497-518-4050', '(535) 685-8273', '859 495 4050', '(826) 738-8316', '724-134-3870', '(554) 269-8937', '125-578-4253', '614 800 2861', '487-232-4449', '(298) 135-0900', '(392) 183-7831', '(606) 596-1029', '(384) 953-4795', '(855) 811-8811', '253-374-7102', '(419) 295-9580', '802 102 8345', '417 393 0050', '787-624-8443', '(919) 486-4251', '341 824 5322', '415 551 1608', '(392) 495-7961', '473-238-3324', '(506) 760-3043', '876-834-0624', '919 611 6170', '(146) 699-3488', '261 434 7760', '617 310 2684', '182-535-3412', '506 129 1694', '(302) 339-0791', '(446) 229-4342', '(249) 602-6985', '(150) 905-6938', '313 990 8823', '656-941-5355', '(116) 689-6617', '955 324 5981', '175 808 2189', '(896) 993-8555', '105-687-6500', '757 524 2964', '201 374 2424', '(466) 912-8401', '766 112 6143', '783-463-4865', '853-803-9900', '(347) 851-5388', '992 114 6973', '316-212-7309', '301 672 1092', '795 137 0201', '381-883-5497', '100-531-4642', '994 923 6634', '920 355 8404', '(441) 445-6532', '325 
795 2455', '593 829 6250', '566-482-9004', '542 537 6770', '716 191 1741', '491-727-7162', '167-336-5660', '358 831 0725', '432 979 7292', '(205) 382-5599', '(208) 794-9612', '728 662 3934', '380-918-8572', '(905) 742-3525', '151 434 6989', '755 544 2629', '633 181 4494', '346 706 5964', '688 690 2184', '618 717 1697', '185-321-6877')
phData <- c(phData, '(147) 535-3529', '(152) 912-4118', '726 943 7486', '(634) 521-4714', '670-248-0186', '(121) 509-7306', '(105) 635-5212', '(732) 168-0110', '364 834 3150', '176 508 2778', '120 941 0833', '670 902 3199', '(214) 250-8756', '(119) 975-8484', '297 484 3285', '(489) 534-6272', '(610) 716-5732', '(456) 925-4236', '743-103-7645', '432-281-3682', '(167) 144-9470', '648 685 6188', '548 191 4898', '288 110 9483', '946-558-5801', '388 744 9637', '506 463 9129', '(848) 149-5208', '(970) 908-2298', '(843) 120-5653', '306 394 8640', '170-641-3537', '860 723 5066', '814-895-6610', '(139) 727-9901', '598 735 8557', '593 895 6761', '(817) 824-3849', '508 484 9738', '719 489 4724', '503-671-4901', '275 649 8183', '968 130 7012', '290 367 6676', '499 766 9941', '538-393-2243', '(540) 362-7136', '(802) 910-1742', '845 544 4748', '784-458-8425', '(365) 217-0634', '708 500 2758', '(594) 797-7729', '982 555 9504', '477-307-3338', '744-301-1148', '389 484 8888', '739 303 6128', '175 905 9962', '(900) 462-1379', '878-636-2294', '(998) 692-1900', '994 421 8642', '304-225-5895', '931-522-5498', '838 898 2275', '718 187 7125', '943 561 8955', '341 261 6456', '(507) 483-3618', '826 266 0205', '(380) 449-7849', '589-975-0198', '657 680 8781', '501-668-7869', '224 365 8299', '747 588 1968', '(706) 836-7047', '(698) 462-6742', '748 446 1257', '525-552-4162', '635-714-8302', '114 668 2834', '404 788 2855', '673 292 2444', '136 788 1426', '564 677 3934', '213 981 7762', '453-556-0852', '423 593 4483', '820 609 7454', '131-641-1331', '824 540 9579', '(753) 726-0123', '739 737 9041', '609-332-7370', '426-182-1365', '(347) 782-5787', '898-210-6218', '938 394 0411', '904 844 1759', '658-861-4306', '(716) 184-1232', '380-105-1757', '(969) 555-0453', '249 452 4370', '443 384 8253', '711 289 2247', '343-973-0193', '367 650 3720', '(265) 286-5671', '148 630 8560', '413-727-2672', '(162) 332-5838', '(229) 604-7790', '(892) 301-0333', '148 501 5084', '564 780 8272', '524 190 0899', '463 
792 2782', '(331) 472-8624', '522-286-5318', '310 719 4550', '314-360-4288', '(301) 534-5754', '822 271 5719', '(341) 473-0639', '(835) 882-3693', '465-550-6610', '388 100 1482', '(589) 194-0523', '982 842 4913', '671 913 4563', '144 468 5864', '194 344 4039', '331 747 5714', '221-190-1449', '(322) 843-0185', '676-614-9095', '(190) 975-2514', '(909) 382-3774', '956 257 9319', '362 145 8268', '570 727 3998', '225 964 9193', '227 419 9482', '949 360 7605')
# Final batch of simulated phone-number strings appended to phData;
# together the batches supply one value per row of sfo_survey below.
phData <- c(phData, '(347) 896-3463', '794 939 9735', '413 754 3034', '806 730 0459')


# Attach the generated phone strings as a `phone` column and inspect.
sfo_survey <- mutate(sfo_survey, phone = phData)
glimpse(sfo_survey)


# Show the survey rows whose phone number uses hyphen separators
sfo_survey %>%
    filter(str_detect(phone, fixed("-")))

# Show the survey rows whose phone number contains a parenthesis
sfo_survey %>%
    filter(str_detect(phone, "[()]"))

# Strip both kinds of parenthesis from the raw phone strings in one pass
# (equivalent to removing "(" and ")" separately).
phone_no_parens <- str_remove_all(sfo_survey$phone, "[()]")

# Attach the de-parenthesized column, then normalize hyphens to spaces
# so every number uses the "ddd ddd dddd" layout.
sfo_survey %>%
    select(id, airline, destination, phone) %>%
    mutate(phone_no_parens = phone_no_parens,
           phone_clean = str_replace_all(phone_no_parens, fixed("-"), " ")) %>%
    tibble::as_tibble()


# Inspect records whose phone string is not the expected 12 characters
sfo_survey %>%
    filter(str_length(phone) != 12) %>%
    select(id, airline, destination, phone) %>%
    tibble::as_tibble()

# Keep only records with a 12-character phone string
sfo_survey %>%
    filter(str_length(phone) == 12) %>%
    select(id, airline, destination, phone) %>%
    tibble::as_tibble()

Chapter 3 - Advanced Data Problems

Uniformity:

  • Uniformity problems arise when the same quantity is recorded in different units or scales - F/C, lb/kg, USD/JPY, etc.
    • Requires research in to the context and the underlying situation to detect and correct
  • Can use lubridate to harmonize even when dates are in different formats
    • parse_date_time(myDate, orders=c("%Y-%m-%d", "%m/%d/%y", "%B %d, %Y"))
    • Caution that dates can be ambiguous, such as 02/04/19 - can be February 4 or April 2

Cross Field Validation:

  • Cross-field validation catches inconsistencies across related fields - for example, component percentages not adding up to 100%
  • Can use lubridate for validating dates
    • as.numeric(as.Date("2019-01-01") %--% today(), "years") # time passed FROM origin TO today, expressed in years

Completeness:

  • Missing data is an important and common data processing problem
    • visdat::vis_miss()
  • There are three types of missingness
    • MCAR
    • MAR - systematic relationship between missingness and other observed values (name is misleading)
    • MNAR - systematic relationship between missingness and unobserved values

Example code includes:

# Load the Chapter 3 bank-accounts exercise data and inspect its structure
accounts <- readRDS("./RInputFiles/ch3_1_accounts.rds")
glimpse(accounts)


# Check out the accounts data frame
head(accounts)

# Define the two date formats that appear in date_opened
formats <- c("%Y-%m-%d", "%B %d, %Y")

# Harmonize date_opened to a single Date-time representation.
# The raw strings are preserved in date_opened_orig so the parse can be
# audited, and id is coerced to character for the joins used later.
accounts <- accounts %>%
    mutate(date_opened_orig=date_opened, id=as.character(id)) %>%
    # Spell out `orders` in full: the original `order=` only worked via
    # R's partial argument matching, which breaks if lubridate ever adds
    # another argument starting with "order".
    mutate(date_opened = lubridate::parse_date_time(date_opened_orig, orders=formats))
str(accounts)


account_offices <- tibble::tibble(id=c('A880C79F', 'BE8222DF', '19F9E113', 'A2FE52A3', 'F6DC2C08', 'D2E55799', '53AE87EF', '3E97F253', '4AE79EA1', '2322DFB4', '645335B2', 'D5EB0F00', '1EB593F7', 'DDBA03D9', '40E4A2F4', '39132EEA', '387F8E4D', '11C3C3C0', 'C2FC91E1', 'FB8F01C1', '0128D2D0', 'BE6E4B3F', '7C6E2ECC', '02E63545', '4399C98B', '98F4CF0F', '247222A6', '420985EE', '0E3903BA', '64EF994F', 'CCF84EDB', '51C21705', 'C868C6AD', '92C237C6', '9ECEADB2', 'DF0AFE50', '5CD605B3', '402839E2', '78286CE7', '168E071B', '466CCDAA', '8DE1ECB9', 'E19FE6B5', '1240D39C', 'A7BFAA72', 'C3D24436', 'FAD92F0F', '236A1D51', 'A6DDDC4C', 'DDFD0B3D', 'D13375E9', 'AC50B796', '290319FD', 'FC71925A', '7B0F3685', 'BE411172', '58066E39', 'EA7FF83A', '14A2DDB7', '305EEAA8', '8F25E54C', '19DD73C6', 'ACB8E6AF', '91BFCC40', '86ACAF81', '77E85C14', 'C5C6B79D', '0E5B69F5', '5275B518', '17217048', 'E7496A7F', '41BBB7B4', 'F6C7ABA1', 'E699DF01', 'BACA7378', '84A4302F', 'F8A78C27', '8BADDF6A', '9FB57E68', '5C98E8F5', '6BB53C2A', 'E23F2505', '0C121914', '3627E08A', 'A94493B3', '0682E9DE', '49931170', 'A154F63B', '3690CCED', '48F5E6D8', '515FAD84', '59794264', '2038185B', '65EAC615', '6C7509C9', 'BD969A9D', 'B0CDCE3D', '33A7F03E'), 
                                  office=c('New York', 'New York', 'Tokyo', 'Tokyo', 'New York', 'Tokyo', 'Tokyo', 'Tokyo', 'Tokyo', 'New York', 'New York', 'New York', 'New York', 'Tokyo', 'New York', 'Tokyo', 'Tokyo', 'New York', 'New York', 'Tokyo', 'Tokyo', 'Tokyo', 'New York', 'New York', 'New York', 'Tokyo', 'New York', 'New York', 'New York', 'New York', 'New York', 'Tokyo', 'Tokyo', 'Tokyo', 'New York', 'Tokyo', 'New York', 'New York', 'Tokyo', 'New York', 'Tokyo', 'New York', 'New York', 'Tokyo', 'New York', 'New York', 'Tokyo', 'Tokyo', 'New York', 'Tokyo', 'Tokyo', 'Tokyo', 'New York', 'New York', 'New York', 'Tokyo', 'Tokyo', 'Tokyo', 'Tokyo', 'Tokyo', 'New York', 'Tokyo', 'New York', 'New York', 'Tokyo', 'Tokyo', 'New York', 'Tokyo', 'New York', 'Tokyo', 'New York', 'New York', 'New York', 'New York', 'New York', 'Tokyo', 'New York', 'New York', 'New York', 'New York', 'New York', 'New York', 'New York', 'New York', 'New York', 'New York', 'Tokyo', 'New York', 'New York', 'New York', 'New York', 'New York', 'New York', 'New York', 'New York', 'New York', 'New York', 'New York')
                                  )
account_offices


# Scatter plot of opening date vs total amount
accounts %>%
    ggplot(aes(x = date_opened, y = total)) +
    geom_point()

# Left join accounts to account_offices by id
accounts %>%
    left_join(account_offices, by = "id") %>%
    # Tokyo-office totals are recorded in JPY; convert them to USD at
    # ~104 JPY per USD so all offices plot on one scale
    mutate(total_usd = ifelse(office == "Tokyo", total / 104, total)) %>%
    # Scatter plot of opening date vs total_usd
    ggplot(aes(x = date_opened, y = total_usd)) +
    geom_point()


accounts <- accounts %>%
    mutate(total=c(169305, 107460, 147088, 143243, 124568, 131113, 147846, 139575, 224409, 189524, 154001, 130920, 191989, 92473, 180547, 150115, 90410, 180003, 105722, 217068, 184421, 150769, 169814, 125117, 130421, 143211, 150372, 123125, 182668, 161141, 136128, 155684, 112818, 85362, 146153, 146635, 87921, 163416, 144704, 87826, 144051, 217975, 101936, 151556, 133790, 101584, 164241, 177759, 67962, 151696, 134083, 154916, 170178, 186281, 179102, 170096, 163708, 111526, 123163, 138632, 189126, 141275, 71359, 132859, 235901, 133348, 188424, 134488, 71665, 193377, 142669, 144229, 183440, 199603, 204271, 186737, 41164, 158203, 216352, 103200, 146394, 121614, 227729, 238104, 85975, 72832, 139614, 133800, 226595, 135435, 98190, 157964, 194662, 140191, 212089, 167238, 145240, 191839), 
           fund_A=c(85018, 64784, 64029, 63466, 21156, 79241, 38450, 11045, 68394, 66964, 68691, 69487, 75388, 32931, 82564, 26358, 7520, 84295, 25398, 69738, 82221, 49607, 82093, 50287, 58177, 84645, 69104, 59390, 47236, 89269, 33405, 53542, 17876, 72556, 40675, 67373, 8474, 59213, 72495, 21642, 19756, 67105, 39942, 18835, 56001, 58434, 70211, 20886, 5970, 30596, 28545, 54451, 54341, 89127, 81321, 86735, 59004, 86856, 49666, 20307, 72037, 72872, 10203, 67405, 79599, 20954, 61972, 88475, 16114, 45365, 8615, 26449, 82468, 84788, 87254, 86632, 7560, 25477, 86665, 28990, 29561, 59013, 86625, 60475, 48482, 15809, 83035, 42648, 70260, 29123, 6452, 68869, 20591, 20108, 58861, 10234, 62549, 80542), 
           fund_B=c(75580, 35194, 15300, 54053, 47935, 26800, 29185, 65907, 80418, 52238, 56400, 48681, 84199, 22162, 68210, 74286, 67142, 31591, 24075, 86768, 60149, 55417, 62756, 23342, 43912, 7088, 63369, 27890, 87437, 25939, 89016, 38234, 15057, 21739, 46482, 63443, 50284, 23460, 38450, 42937, 80182, 72907, 38580, 46135, 54885, 21069, 73984, 80883, 20088, 84390, 37537, 35906, 32764, 43356, 18106, 56580, 16987, 19406, 25407, 35028, 62513, 51219, 51163, 7399, 79291, 33018, 69266, 44383, 35691, 58558, 72841, 83938, 73281, 47808, 57043, 33506, 21040, 43902, 77117, 24986, 29023, 39086, 79950, 89011, 7054, 15617, 22239, 16464, 84337, 23204, 60014, 32999, 89990, 46764, 76975, 83183, 48606, 87909), 
           fund_C=c(8707, 7482, 67759, 25724, 55477, 25072, 80211, 62623, 75597, 70322, 28910, 56408, 32402, 37380, 29773, 49471, 15748, 64117, 56249, 60562, 42051, 45745, 24965, 51488, 28332, 51478, 17899, 35845, 47995, 45933, 13707, 63908, 79885, 19537, 58996, 15819, 29163, 80743, 33759, 23247, 44113, 77963, 23414, 86586, 22904, 22081, 20046, 75990, 41904, 36710, 68001, 64559, 83073, 53798, 79675, 26781, 87717, 5264, 48090, 83297, 54576, 17184, 9993, 58055, 77011, 79376, 57186, 46475, 19860, 89454, 61213, 33842, 27691, 67007, 59974, 66599, 12564, 88824, 52570, 49224, 87810, 23515, 61154, 88618, 30439, 41406, 34340, 74688, 71998, 83108, 31724, 56096, 84081, 73319, 76253, 73821, 34085, 23388), 
           acct_age=c(16, 1, 11, 14, 8, 12, 2, 0, 8, 1, 1, 18, 14, 13, 11, 7, 9, 1, 15, 18, 14, 10, 16, 4, 18, 5, 4, 11, 4, 11, 11, 3, 20, 14, 1, 15, 3, 0, 10, 6, 18, 4, 10, 8, 0, 17, 12, 0, 19, 18, 14, 3, 14, 13, 6, 3, 4, 14, 1, 1, 11, 17, 6, 6, 8, 10, 12, 1, 2, 18, 11, 15, 12, 12, 14, 16, 14, 9, 2, 5, 3, 2, 2, 11, 10, 17, 9, 19, 5, 0, 6, 12, 3, 16, 19, 12, 5, 12)
           )
str(accounts)


# Flag accounts whose stated total disagrees with the sum of its funds
accounts %>%
    # Recompute the total from the three component funds
    mutate(theoretical_total = fund_A + fund_B + fund_C) %>%
    # Keep only the rows where the stored total fails the cross-check
    filter(theoretical_total != total)


library(lubridate)

# Find invalid acct_age
accounts %>%
  # theoretical_age: whole years between date_opened and today.
  # `%--%` builds a lubridate interval; as.numeric(interval, "years")
  # expresses its length in years. NOTE(review): this depends on the
  # day the code runs, so which rows are flagged drifts over time.
  mutate(theoretical_age = floor(as.numeric(date_opened %--% today(), "years"))) %>%
  # Filter for rows where acct_age is different from theoretical_age
  filter(acct_age != theoretical_age)


accounts <- accounts %>%
    mutate(age=c(54, 36, 49, 56, 21, 47, 53, 29, 58, 53, 44, 59, 48, 34, 22, 50, 35, 20, 21, 41, 42, 28, 35, 33, 30, 50, 53, 45, 26, 39, 34, 43, 58, 45, 57, 20, 46, 33, 29, 44, 22, 27, 30, 55, 27, 46, 25, 50, 37, 53, 56, 52, 29, 32, 21, 47, 57, 56, 42, 21, 45, 56, 33, 49, 56, 35, 58, 57, 54, 26, 28, 39, 53, 28, 30, 46, 40, 56, 41, 36, 51, 45, 21, 48, 59, 46, 48, 41, 23, 59, 27, 32, 32, 23, 24, 36, 57, 20), 
           inv_amount=c(35500.5, 81921.86, 46412.27, 76563.35, NA, 93552.69, 70357.7, 14429.59, 51297.32, 15052.7, 70173.49, 12401.32, 58388.14, 44656.36, NA, 7516.33, 67961.74, NA, NA, 31620.86, 11993.35, 49090.83, 93233, 86992.74, 35476.83, 66797.81, 7282.91, 35939.08, 84432.03, 21574.21, 51478.91, 22053.26, 8145.24, 25250.82, 63332.9, NA, 84066.66, 33929.23, 44340.56, 21959.28, NA, 87882.91, 49180.36, 27532.35, 61650.12, 3216.72, NA, 61481.86, 22963.63, 35760.69, 82251.59, 81490.13, 57252.76, 30898.16, NA, 68468.28, 83364.21, 47826.51, 25759.85, NA, 4217.92, 89342.43, 20932.3, 62692.03, 55888.87, 13468.4, 24569.47, 76216.88, 45162.06, 27963.45, 48979.16, 36572.69, 32150.64, 66914.63, 20970.35, 10582.94, 90442.57, 20441.92, 31803.34, 49387.29, 14881.89, 60408.99, NA, 14585.75, 60798.23, 26166.11, 28459.96, 23714.06, NA, 50814.83, 65969.8, 31395, 77896.86, NA, NA, 9387.87, 10967.69, NA)
           )
str(accounts)


# Visualize the missingness pattern across every column
visdat::vis_miss(accounts)

# Compare mean customer age between rows with and without inv_amount
# (a difference would suggest missingness is related to age, i.e. MAR)
accounts %>%
    # Grouping variable computed directly inside group_by()
    group_by(missing_inv = is.na(inv_amount)) %>%
    summarize(avg_age = mean(age))

# Re-check the missingness plot with rows ordered by age
accounts %>%
    arrange(age) %>%
    visdat::vis_miss()


accounts <- accounts %>%
    mutate(cust_id=ifelse(rnorm(nrow(.)) < -1, NA, id), 
           acct_amount=c(44244.71, NA, NA, NA, NA, 109737.62, NA, NA, 63523.31, 38175.46, 90469.53, 53796.13, NA, 83653.09, 86028.48, 12209.84, 83127.65, 89961.77, 66947.3, 75207.99, 32891.31, 92838.44, 120512, 99771.9, 71782.2, 95038.14, 83343.18, 59678.01, NA, NA, 55976.78, 92007.12, 59700.08, 79630.02, 88440.54, 31981.36, 95352.02, 82511.24, 82084.76, 31730.19, 41942.23, NA, 86503.33, 28834.71, NA, 32220.83, 97856.46, 97833.54, 24267.02, 82058.48, NA, 109943.03, 67297.46, 82996.04, 89855.98, 96673.37, 99193.98, 84505.81, NA, NA, 84107.71, 100266.99, 98923.14, 63182.57, 95275.46, 99141.9, 59863.77, 98047.16, 83345.15, 92750.87, 73618.75, 44226.86, 99490.61, 95315.71, 52684.17, 21757.14, 250046.76, 26585.87, 64944.62, 61795.89, 35924.41, 99577.36, 87312.64, 28827.59, 89138.52, 88682.34, 34679.6, 84132.1, 75508.61, 57838.49, 70272.97, 33984.87, 92169.14, 21942.37, 74010.15, 40651.36, 27907.16, NA))
str(accounts)


# Build accounts_clean: drop rows missing an id or investment amount,
# then back-fill the missing account balances.
accounts_clean <- accounts %>%
    # Rows without cust_id or inv_amount cannot be repaired, so remove them
    filter(!is.na(cust_id), !is.na(inv_amount)) %>%
    # Impute a missing balance as 5 * inv_amount (the exercise's assumed
    # balance-to-investment ratio — TODO confirm against the course data);
    # coalesce() keeps the existing value whenever it is present.
    mutate(acct_amount_filled = coalesce(acct_amount, 5 * inv_amount))

# Confirm the key columns now contain no missing values
assertive::assert_all_are_not_na(accounts_clean$cust_id)

assertive::assert_all_are_not_na(accounts_clean$acct_amount_filled)

Chapter 4 - Record Linkage

Comparing Strings:

  • Many ways to measure differences in values
  • Number comparisons are easy - distance or percentage or the like
  • String comparisons are more tricky, and are often based on the edit distances
  • Minimum edit distance is the smallest number of edits (typos) needed to convert one string to another
    • Depends on how edits are counted - can have addition, deletion only; or can have replacement count as a single action
    • stringdist(a, b, method=)
    • stringdist_left_join(a, b, by=, method=, max_dist=)

Generating and Comparing Strings:

  • Joins can struggle due to differences in format - for example, times listed as military vs. AM/PM vs UTC/local, etc.
  • Can use “blocking”, where partial matches are only considered for matches on the blocking variable
    • pair_blocking(df_A, df_B, blocking_var = "state") %>% compare_pairs(by = "name", default_comparator = lcs())

Scoring and Linking:

  • Can add the scores together to get the full matching score
    • score_simsum()
  • Can use probabilistic scoring to give different weights to each variable
    • score_problink()
  • Can select the matches with the highest scores
    • select_n_to_m()
  • Can link the frames together using the link() function
    • link()

Wrap Up:

  • Diagnose dirty data, side effects of dirty data, cleaning process for dirty data
  • Data type constraints, data range constraints, uniqueness constraints
  • Membership constraints, categorical variables, cleaning text data
  • Uniformity, cross-field validation, missing data
  • Record linkage for when traditional joins cannot be used

Example code includes:

# Load the two restaurant listings that will be record-linked
zagat <- readRDS("./RInputFiles/zagat.rds")
glimpse(zagat)

fodors <- readRDS("./RInputFiles/fodors.rds")
glimpse(fodors)


# Calculate Damerau-Levenshtein distance
stringdist::stringdist("las angelos", "los angeles", method = "dl")

# Calculate Longest Common Subsequence (LCS) distance
stringdist::stringdist("las angelos", "los angeles", method = "lcs")

# Calculate Jaccard distance
stringdist::stringdist("las angelos", "los angeles", method = "jaccard")


# Canonical city spellings used as the fuzzy-join reference table
cities <- tibble::tibble(
    city_actual = c("new york", "los angeles", "atlanta",
                    "san francisco", "las vegas")
)
cities


# Tally how often each raw city spelling occurs in zagat
zagat %>%
    count(city)

# Fuzzy-join zagat's raw city spellings to the canonical list
zagat %>%
    # String-distance left join matching city against city_actual
    fuzzyjoin::stringdist_left_join(cities, by = c("city" = "city_actual")) %>%
    # Keep just the columns needed to eyeball the matches
    select(name, city, city_actual)


# reclin works on character keys, so drop any factor encoding of city
zagat <- mutate(zagat, city = as.character(city))
fodors <- mutate(fodors, city = as.character(city))

# Generate every zagat x fodors candidate pair (no blocking)
reclin::pair_blocking(zagat, fodors)

# Block on city: only pairs that share a city value are generated
reclin::pair_blocking(zagat, fodors, blocking_var = "city")


# City-blocked pairs with name similarity scored by LCS
reclin::pair_blocking(zagat, fodors, blocking_var = "city") %>%
    reclin::compare_pairs(by = "name", default_comparator = reclin::lcs())

# Same blocking, comparing name, phone and address via Jaro-Winkler
reclin::pair_blocking(zagat, fodors, blocking_var = "city") %>%
    reclin::compare_pairs(
        by = c("name", "phone", "addr"),
        default_comparator = reclin::jaro_winkler()
    )


# Full record-linkage pipeline: block, compare, score, select, link.
# Written as named stages so each intermediate can be inspected.
paired <- reclin::pair_blocking(zagat, fodors, blocking_var = "city")
compared <- reclin::compare_pairs(paired, by = "name",
                                  default_comparator = reclin::jaro_winkler())
# Probabilistic scoring weights each comparison variable
scored <- reclin::score_problink(compared)
# Keep the best one-to-one assignment of pairs
selected <- reclin::select_n_to_m(scored)
# Materialize the linked data frame
recPairs <- reclin::link(selected)

dim(recPairs)
head(recPairs)
tail(recPairs)

Handling Missing Data With Imputations in R

Chapter 1 - Problem of Missing Data